diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json rename to data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json index 42f19b810..8176fa91a 100644 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/18881f8b-b06e-4317-b697-6eadb975077c.json +++ b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + 
"additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json rename to data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json index c596a8093..4d2b264af 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/97db1a8d-b7d8-4481-82fb-dc0c6396edac.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { 
"source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.44, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": 
"WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json similarity index 75% rename from data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json rename to data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json index da8bb1b91..39fbc0d1c 100644 --- a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/8d29f447-01d8-4fae-87d5-b4386ce5239a.json +++ b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.405, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json 
b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json similarity index 75% rename from data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json rename to data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json index cb4638d3d..99d31c069 100644 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/53090373-ea82-4b63-83fd-f1d48f0400cd.json +++ b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.332, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + 
"num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json similarity index 75% rename from data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json rename to data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json index 0670c6db3..c786f36c7 100644 --- a/data/helm_capabilities/amazon/nova-lite-v1:0/6665062e-03c1-4758-8858-1184405a3538.json +++ b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.551, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - 
"evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json similarity index 75% rename from data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json rename to data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json index 2c6f0abd0..6219cdf47 100644 --- a/data/helm_capabilities/amazon/nova-micro-v1:0/5f53ac6c-1d10-4f07-acc3-d622c5360168.json +++ b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.522, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json similarity index 76% rename from data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json rename to data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json index 3a64b94b2..d9f1bd857 100644 --- a/data/helm_capabilities/amazon/nova-premier-v1:0/bcfec13c-8645-4ad2-a746-67e951e07aa9.json +++ b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - 
"source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json similarity index 75% rename from data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json rename to data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json index bbdb8512b..658945ff5 100644 --- a/data/helm_capabilities/amazon/nova-pro-v1:0/b8fb264c-9d58-4a55-8b48-c3f2e116828d.json +++ b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT 
correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json similarity index 76% rename from 
data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json rename to data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json index 44b7ab97a..d63e271d1 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/568969ac-4b9a-42b0-8374-2b28dde30a3c.json +++ b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.549, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": 
"true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json rename to data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json index b8e94bdb5..c53a3aa66 100644 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/c6b92f00-6335-463d-87db-817ff85f36c8.json +++ b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet 
(20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.653, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, 
rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json rename to data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json index a41bf85dc..1f5c52f66 100644 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/460fdbd2-a164-4af4-95ff-db66e381ca0c.json +++ b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.7 Sonnet (20250219)", + "name": "Claude 3.7 Sonnet 20250219", "id": "anthropic/claude-3-7-sonnet-20250219", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - 
"evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json 
b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json new file mode 100644 index 000000000..da15e55a7 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Haiku 20251001", + "id": "anthropic/claude-haiku-4-5-20251001", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.717, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 7.381503096938465 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.777, + "details": { + "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)", + "tab": "Efficiency", + "score": 3.7008020806312563 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)", + "tab": "General information", + "score": 374.129 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + 
"score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.605, + "details": { + "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)", + "tab": "Efficiency", + "score": 5.102193982611857 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)", + "tab": "General information", + "score": 524.5246636771301 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.801, + "details": { + "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)", + "tab": "Efficiency", + "score": 4.355410516372229 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)", + "tab": "General information", + "score": 390.4158964879852 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.839, + "details": { + "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)", + "tab": "Efficiency", + 
"score": 16.317131044387818 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)", + "tab": "General information", + "score": 1835.337 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.561, + "details": { + "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)", + "tab": "Efficiency", + "score": 7.431977860689163 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)", + "tab": "General information", + "score": 937.799 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json index a9349e9cb..c554c6a65 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/cb21169b-04ff-47d1-92dd-5b5f2e09b863.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + 
"schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514, extended thinking)", + "name": "Claude 4 Opus 20250514, extended thinking", "id": "anthropic/claude-opus-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json rename to data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json index c82ca8963..240e9ebf4 100644 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/2168d830-ad6b-4aee-94f0-7ec8fd403a49.json +++ b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Opus (20250514)", + "name": "Claude 4 Opus 20250514", "id": "anthropic/claude-opus-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.757, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json index 6bf01f358..ecc6c0f0a 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/a5242cb1-b0fb-464f-ba7c-2d92deea03d3.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514, extended thinking)", + "name": "Claude 4 Sonnet 20250514, extended thinking", "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.766, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json similarity index 76% rename from data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json rename to data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json index af4facce4..b4413ccdd 100644 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/629d5de7-25ed-4088-aca6-7fb53719f4a4.json +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_capabilities/anthropic_claude-sonnet-4-20250514/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 4 Sonnet (20250514)", + "name": "Claude 4 Sonnet 20250514", "id": "anthropic/claude-sonnet-4-20250514", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval 
Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json new file mode 100644 index 000000000..e0991c0d9 --- /dev/null +++ b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Claude 4.5 Sonnet 20250929", + "id": "anthropic/claude-sonnet-4-5-20250929", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.762, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 17.536448448412127 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + 
"evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.869, + "details": { + "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)", + "tab": "Efficiency", + "score": 9.029817205530268 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", + "tab": "General information", + "score": 252.461 + }, + "MMLU-Pro - # output tokens": { + "description": "min=392.292, mean=392.292, max=392.292, sum=392.292 (1)", + "tab": "General information", + "score": 392.292 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.686, + "details": { + "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)", + "tab": "Efficiency", + "score": 12.414452127318263 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", + "tab": "General information", + "score": 272.73766816143495 + }, + "GPQA - # output tokens": { + "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)", + "tab": "General information", + "score": 544.2152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.85, + "details": { + "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", + "tab": "Accuracy", 
+ "IFEval - Observed inference time (s)": { + "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)", + "tab": "Efficiency", + "score": 10.90394415211986 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", + "tab": "General information", + "score": 47.15896487985213 + }, + "IFEval - # output tokens": { + "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)", + "tab": "General information", + "score": 414.63216266173754 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.854, + "details": { + "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)", + "tab": "Efficiency", + "score": 38.54364204096484 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)", + "tab": "General information", + "score": 1804.604 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.553, + "details": { + "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)", + "tab": "Efficiency", + "score": 16.790386716127397 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + 
"score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", + "tab": "General information", + "score": 110.563 + }, + "Omni-MATH - # output tokens": { + "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)", + "tab": "General information", + "score": 892.774 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json similarity index 76% rename from data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json rename to data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json index 0b36b4b41..682cc94cc 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/fcc025bc-98aa-44ef-b64d-a45a8e4daaa8.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.699, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json similarity index 76% rename from data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json rename to data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json index 3502a2f83..3b034de70 100644 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/d031935b-2b54-4940-a852-dad1f10fc396.json +++ b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.665, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json similarity index 75% rename from data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json rename to data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json index 9cecc3e6e..7d4281de4 100644 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/b79010aa-d441-4850-b656-52ce6587dab8.json +++ b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json similarity index 75% rename from data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json rename to data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json index c41c3cf10..3c438fd59 100644 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/dde5a36d-f14b-482d-86db-74bdb3771e38.json +++ b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.657, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - 
"use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json similarity index 76% rename from data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json rename to data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json index 963d02bef..7f589b967 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/981ba423-a1d2-4577-9f61-9c4b8b430b58.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, 
"generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json rename to data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json index 87e886284..0376cdf40 100644 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/56ddcce9-fc1c-476f-96c8-65a7d732c95b.json +++ b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash Lite (02-05 preview)", + "name": "Gemini 2.0 Flash Lite 02-05 preview", "id": "google/gemini-2.0-flash-lite-preview-02-05", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.642, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + 
"evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json similarity index 76% rename from 
data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json rename to data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json index a5294b486..600681fbb 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/22da4909-8b3b-49f3-940f-8764509725f8.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.591, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json rename to data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json index d0e1ed757..221dc7a91 100644 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/a6b3d596-d204-4cb7-a3e4-4e717537b76a.json +++ b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Flash (04-17 preview)", + "name": "Gemini 2.5 Flash 04-17 preview", "id": "google/gemini-2.5-flash-preview-04-17", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.626, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, 
"generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json similarity index 75% rename from data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json rename to data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json index f1093c814..355cd3bc1 100644 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/eaa18be0-1195-4344-9673-efa8c555456d.json +++ b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.5 Pro (03-25 preview)", + "name": "Gemini 2.5 Pro 03-25 preview", "id": "google/gemini-2.5-pro-preview-03-25", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json new file mode 100644 index 000000000..d3ecb3ebb --- /dev/null +++ b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json @@ -0,0 +1,345 @@ +{ + 
"schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Gemini 3 Pro Preview", + "id": "google/gemini-3-pro-preview", + "developer": "google", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.799, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 50.969324812798575 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.903, + "details": { + "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)", + "tab": "Efficiency", + "score": 34.903078527212145 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", + "tab": "General information", + "score": 263.673 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.803, + "details": { + "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)", + "tab": "Efficiency", 
+ "score": 69.16407415364355 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", + "tab": "General information", + "score": 273.7354260089686 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.876, + "details": { + "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)", + "tab": "Efficiency", + "score": 18.200553727458452 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", + "tab": "General information", + "score": 47.33086876155268 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.859, + "details": { + "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)", + "tab": "Efficiency", + "score": 37.09404513451669 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + 
"score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.555, + "details": { + "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)", + "tab": "Efficiency", + "score": 95.48487252116203 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", + "tab": "General information", + "score": 111.956 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json similarity index 76% rename from data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json rename to data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json index 42be38419..869902b9d 100644 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/0ae30d3c-098c-434f-985b-58e8179148a6.json +++ b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The 
mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.463, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json new file mode 100644 index 000000000..03bc0f0f8 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Small", + "id": "ibm/granite-4.0-h-small", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.575, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 21.31162992088884 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.569, + "details": { + "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)", + "tab": "Efficiency", + "score": 12.070928404092788 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)", + 
"tab": "General information", + "score": 372.93 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.383, + "details": { + "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)", + "tab": "Efficiency", + "score": 17.606201725690354 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)", + "tab": "General information", + "score": 439.6479820627803 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.89, + "details": { + "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)", + "tab": "Efficiency", + "score": 13.366226098453712 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)", + "tab": "General information", + "score": 494.7171903881701 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.739, + "details": { + "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)", + "tab": "Efficiency", + "score": 30.80672695994377 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)", + "tab": "General information", + "score": 996.159 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.296, + "details": { + "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)", + "tab": "Efficiency", + "score": 32.70806641626358 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)", + "tab": "General information", + "score": 1020.51 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json new file mode 100644 index 000000000..399dbb1e3 --- /dev/null +++ b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + 
"source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "IBM Granite 4.0 Micro", + "id": "ibm/granite-4.0-micro", + "developer": "ibm", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.486, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 5.725128505637726 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.395, + "details": { + "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)", + "tab": "Efficiency", + "score": 3.1348352246284485 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", + "tab": "General information", + "score": 288.391 + }, + "MMLU-Pro - # output tokens": { + "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)", + "tab": "General information", + "score": 325.255 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.307, + "details": { + "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=3.075, mean=3.075, max=3.075, sum=3.075 (1)", + "tab": "Efficiency", + "score": 3.075281912970436 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + 
"score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", + "tab": "General information", + "score": 303.2645739910314 + }, + "GPQA - # output tokens": { + "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)", + "tab": "General information", + "score": 337.4170403587444 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.849, + "details": { + "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)", + "tab": "Efficiency", + "score": 4.580414981806785 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", + "tab": "General information", + "score": 51.53419593345656 + }, + "IFEval - # output tokens": { + "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)", + "tab": "General information", + "score": 497.8003696857671 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.67, + "details": { + "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)", + "tab": "Efficiency", + "score": 8.160923891305924 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 
(1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)", + "tab": "General information", + "score": 1037.706 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.209, + "details": { + "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)", + "tab": "Efficiency", + "score": 9.674186517477036 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", + "tab": "General information", + "score": 118.438 + }, + "Omni-MATH - # output tokens": { + "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)", + "tab": "General information", + "score": 1145.889 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json similarity index 75% rename from data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json rename to data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json index 3622da7c6..736686c13 100644 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/cc90bae5-b964-4402-9edb-5427663f01fb.json +++ b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,16 +100,25 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -141,16 +162,25 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false", + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -194,13 +224,22 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -244,14 +283,23 @@ } }, "generation_config": { - "subset": "v2", - "num_output_tokens": "2048" + "additional_details": { + "subset": "v2", + "num_output_tokens": 
"2048" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -295,7 +343,9 @@ } }, "generation_config": { - "num_output_tokens": "2048" + "additional_details": { + "num_output_tokens": "2048" + } } } ] diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json rename to data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json index 6e7a59864..4dd5465a5 100644 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/2e8f73ba-73d9-43c6-9a26-a3a5b5375e50.json +++ b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.618, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": 
{ - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json rename to data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json index 9ba719da5..407242cbb 100644 
--- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/9b7139aa-a3e4-496e-9fb8-5c64d15ea945.json +++ b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.574, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json similarity index 76% rename from data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json rename to data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json index 4657892fd..30524d64b 100644 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/930db2c4-d9c5-4e38-ae80-7304c2f10611.json +++ b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + 
"dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.444, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + 
"subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json similarity index 75% rename from data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json rename to data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json index 9c2141acc..d9ca75120 100644 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/226ce6f9-0cd6-469b-bf8a-f0c322b7f750.json +++ b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Maverick (17Bx128E) Instruct FP8", + "name": "Llama 4 Maverick 17Bx128E Instruct FP8", "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.718, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json similarity index 76% rename from data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json rename to 
data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json index 2d19156dc..640472423 100644 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/bb4e408d-505e-46c8-bd0c-7afa44a96498.json +++ b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 4 Scout (17Bx16E) Instruct", + "name": "Llama 4 Scout 17Bx16E Instruct", "id": "meta/llama-4-scout-17b-16e-instruct", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + 
"evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json rename to data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json index 6663598e4..0b19a4ab4 100644 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/d63dad7a-f7b7-4c87-9712-3043fc117545.json +++ b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", 
"inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.376, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json rename to data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json index db1fa9b82..dec52ca8a 100644 --- a/data/helm_capabilities/mistralai/mistral-large-2411/7e7f739e-9363-4c41-871d-6cf6c4145728.json +++ b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2411)", + "name": "Mistral Large 2411", "id": "mistralai/mistral-large-2411", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json similarity index 76% rename from data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json rename to 
data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json index 69ce74931..7999b823d 100644 --- a/data/helm_capabilities/mistralai/mistral-small-2503/853d8802-1f0b-463e-b0e8-c98b4c6b60a8.json +++ b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small 3.1 (2503)", + "name": "Mistral Small 3.1 2503", "id": "mistralai/mistral-small-2503", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.558, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": 
"IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json similarity index 76% rename from data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json rename to data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json index 2dfb94872..583f7956f 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/b05befca-44a5-45fb-823e-84bcc3ae81d0.json +++ b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x22B)", + "name": "Mixtral Instruct 8x22B", "id": "mistralai/mixtral-8x22b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 
+17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.478, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json similarity index 76% rename from data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json rename to data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json index 293d11168..d2c9cfb4e 100644 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2b1f7fa1-24df-4fb7-8255-d83992e32b8b.json +++ b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral Instruct (8x7B)", + "name": "Mixtral Instruct 8x7B", "id": "mistralai/mixtral-8x7b-instruct-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.397, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json similarity index 76% rename from data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json rename to 
data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json index 4c25e86d3..1946db617 100644 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/eaeab0d7-4418-4699-9774-bc1c6711b3d3.json +++ b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.768, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval 
strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json rename to data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json index c005600e1..3c36cb01b 100644 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/c58f0e5d-5fe4-4a94-a9a2-7835842482b8.json +++ b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 (2025-04-14)", + "name": "GPT-4.1 2025-04-14", "id": "openai/gpt-4.1-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores 
from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json rename to data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json index d6481e60a..dd4503511 100644 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/acaf03fd-9d4b-4fe3-8ffe-88212a786363.json +++ b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 mini (2025-04-14)", + "name": "GPT-4.1 mini 2025-04-14", "id": "openai/gpt-4.1-mini-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - 
COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json rename to data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json index e878bf385..e2550958a 100644 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/308d3e1d-a1b9-4722-8333-23b840316e3d.json +++ b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4.1 nano (2025-04-14)", + "name": "GPT-4.1 nano 2025-04-14", "id": "openai/gpt-4.1-nano-2025-04-14", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.616, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + 
"evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json rename to data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json index ae08e8732..3c3d40256 100644 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/84a942b6-2b77-4bc2-859f-6b8d6be93558.json +++ b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-11-20)", + "name": "GPT-4o 2024-11-20", "id": "openai/gpt-4o-2024-11-20", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.634, 
"details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": 
"Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json similarity index 76% rename from data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json rename to data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json index c3aeb8ab5..778449e6e 100644 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/7e0e6ec7-1b72-4764-8fa4-f7646b4b93d3.json +++ b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json rename to data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json index 2fd77c3d1..95d9762ef 100644 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/cb444c37-e273-4aaf-881e-8a433f630053.json +++ b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 (2025-08-07)", + "name": "GPT-5 2025-08-07", "id": "openai/gpt-5-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.807, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json rename to data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json index cf4a0414b..5dc165206 100644 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/7af059e2-b56e-46ed-b699-63e570081f16.json +++ b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 mini (2025-08-07)", + "name": "GPT-5 mini 2025-08-07", "id": "openai/gpt-5-mini-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json similarity index 76% rename from data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json rename to data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json index a9996e0cd..096518c62 100644 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/2dc0b2e4-c412-4c83-8b7a-6ee778e4c421.json +++ b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-5 nano (2025-08-07)", + "name": "GPT-5 nano 2025-08-07", "id": "openai/gpt-5-nano-2025-08-07", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.748, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain 
of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json new file mode 100644 index 000000000..738007852 --- /dev/null +++ b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-5.1 2025-11-13", + "id": "openai/gpt-5.1-2025-11-13", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.656, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 10.620566227529599 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.579, + "details": { + "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", + "tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", + "tab": "Efficiency", + "score": 1.1470122172832489 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", + "tab": "General information", + "score": 248.569 + }, + "MMLU-Pro - # output tokens": { + "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)", + "tab": "General information", + "score": 5.002 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.442, + "details": { + "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)", + "tab": "Efficiency", + "score": 1.002433323539426 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", + "tab": "General information", + "score": 268.15246636771303 + }, + "GPQA - # output tokens": { + "description": 
"min=5.422, mean=5.422, max=5.422, sum=5.422 (1)", + "tab": "General information", + "score": 5.42152466367713 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.935, + "details": { + "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)", + "tab": "Efficiency", + "score": 13.15882584436103 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", + "tab": "General information", + "score": 45.67097966728281 + }, + "IFEval - # output tokens": { + "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)", + "tab": "General information", + "score": 647.0628465804067 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.863, + "details": { + "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)", + "tab": "Efficiency", + "score": 28.08133857488632 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)", + "tab": "General information", + "score": 2059.716 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.464, + "details": { + "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)", + "tab": "Efficiency", + "score": 9.713221177577973 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", + "tab": "General information", + "score": 109.623 + }, + "Omni-MATH - # output tokens": { + "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)", + "tab": "General information", + "score": 1256.266 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json similarity index 76% rename from data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json rename to data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json index 0b6f0418d..8642e9954 100644 --- a/data/helm_capabilities/openai/gpt-oss-120b/e9a85dec-b32a-4f7f-ad66-a4bdc314501e.json +++ b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json similarity index 76% rename from data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json rename to data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json index 36043d89a..5112d535f 100644 --- a/data/helm_capabilities/openai/gpt-oss-20b/acb07214-c0f3-4006-8a3b-23793891a1bf.json +++ b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.674, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + 
"dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json similarity index 75% rename from data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json rename to data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json index 2d017bb31..677721448 100644 --- a/data/helm_capabilities/openai/o3-2025-04-16/a1c5d581-be98-4e1e-ba14-ca922bfac035.json +++ b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o3 (2025-04-16)", + "name": "o3 2025-04-16", "id": "openai/o3-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.811, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json similarity index 76% rename from data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json rename to data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json index db654a7b8..fd4ae16c5 100644 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/c7b6ae15-cfe1-4bbd-a4d1-d45465b74081.json +++ b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "o4-mini (2025-04-16)", + "name": "o4-mini 2025-04-16", "id": "openai/o4-mini-2025-04-16", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + 
"use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json similarity index 76% rename from data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json rename to data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json index 7bc9ee7ae..50778c699 100644 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/f6d74c93-0e96-4fc5-987c-18a79dbde17c.json +++ 
b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.599, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - 
"evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json similarity index 76% rename from data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json rename to data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json index 921d79480..c974f1019 100644 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/f96da103-5350-4b1b-b33e-6ced1f1f7815.json +++ b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.529, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json similarity index 76% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json index 7bc1c5881..9ded60c84 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/27bae7f2-92dd-4feb-9050-2d11c6da2d61.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.726, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json similarity index 76% rename from data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json rename to data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json index 355119fa7..0210712c3 100644 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/0524c7a5-aad2-41d9-b7fb-1d07f8f13846.json +++ b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.798, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB 
Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json new file mode 100644 index 000000000..6ee69548e --- /dev/null +++ b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json @@ -0,0 +1,345 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", + "source_metadata": { + "source_name": "helm_capabilities", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Qwen3-Next 80B A3B Thinking", + "id": "qwen/qwen3-next-80b-a3b-thinking", + "developer": "qwen", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "The mean of the scores from all columns.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.7, + "details": { + "tab": "Accuracy", + "Mean score - Efficiency": { + "description": null, + "tab": "Efficiency", + "score": 27.61164260375731 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on MMLU-Pro", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.786, + "details": { + "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", + 
"tab": "Accuracy", + "MMLU-Pro - Observed inference time (s)": { + "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)", + "tab": "Efficiency", + "score": 20.09722422862053 + }, + "MMLU-Pro - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "MMLU-Pro - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "MMLU-Pro - # prompt tokens": { + "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", + "tab": "General information", + "score": 259.715 + }, + "MMLU-Pro - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "COT correct on GPQA", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.63, + "details": { + "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", + "tab": "Accuracy", + "GPQA - Observed inference time (s)": { + "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)", + "tab": "Efficiency", + "score": 40.06039341950096 + }, + "GPQA - # eval": { + "description": "min=446, mean=446, max=446, sum=446 (1)", + "tab": "General information", + "score": 446.0 + }, + "GPQA - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "GPQA - # prompt tokens": { + "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", + "tab": "General information", + "score": 274.36995515695065 + }, + "GPQA - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } + } + }, + { + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "IFEval Strict Acc on IFEval", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.81, + "details": { + "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", + "tab": "Accuracy", + "IFEval - Observed inference time (s)": { + "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)", + "tab": "Efficiency", + "score": 13.89268838323639 + }, + "IFEval - # eval": { + "description": "min=541, mean=541, max=541, sum=541 (1)", + "tab": "General information", + "score": 541.0 + }, + "IFEval - # train": { + "description": 
"min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "IFEval - # prompt tokens": { + "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", + "tab": "General information", + "score": 46.491682070240294 + }, + "IFEval - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "WB Score on WildBench", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.807, + "details": { + "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", + "tab": "Accuracy", + "WildBench - Observed inference time (s)": { + "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)", + "tab": "Efficiency", + "score": 23.095464605808257 + }, + "WildBench - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "WildBench - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # prompt tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "WildBench - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": { + "subset": "v2" + } + } + }, + { + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, + "metric_config": { + "evaluation_description": "Acc on Omni-MATH", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.467, + "details": { + "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", + "tab": "Accuracy", + "Omni-MATH - Observed inference time (s)": { + "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)", + "tab": "Efficiency", + "score": 40.91244238162041 + }, + "Omni-MATH - # eval": { + "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", + "tab": "General information", + "score": 1000.0 + }, + "Omni-MATH - # train": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - truncated": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General information", + "score": 0.0 + }, + "Omni-MATH - # prompt tokens": { + "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", + "tab": "General information", + "score": 111.6 + }, + "Omni-MATH - # output tokens": { + "description": "min=0, mean=0, max=0, sum=0 (1)", + "tab": "General 
information", + "score": 0.0 + } + } + }, + "generation_config": { + "additional_details": {} + } + } + ] +} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json similarity index 76% rename from data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json rename to data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json index cf2b63d2e..b86fc5b45 100644 --- a/data/helm_capabilities/writer/palmyra-fin/39e948ed-a41e-4fde-aa25-9ceb84fdf0b9.json +++ b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.577, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + 
"use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json similarity index 75% rename from data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json rename to data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json index 0d8108574..ac68f722a 100644 --- a/data/helm_capabilities/writer/palmyra-med/1d046894-4412-4e5a-a6e1-8b30d9dd7b57.json +++ b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.476, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + 
"evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json similarity index 76% rename from data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json rename to data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json index 8b3240898..9398b6319 100644 --- a/data/helm_capabilities/writer/palmyra-x-004/01d3d6e7-5ca0-4ae8-8d03-b3c83c59fe6f.json +++ b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json similarity index 75% rename from data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json rename to data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json index ba834a256..6d3707107 100644 --- a/data/helm_capabilities/writer/palmyra-x5/c79b1007-a4f9-45f0-945c-d9e9bef65d2d.json +++ b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.696, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json similarity index 76% rename from data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json rename to data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json index 7640dfe10..54503d043 100644 --- a/data/helm_capabilities/xai/grok-3-beta/24efb1b7-f34d-4ee8-8f90-deb8d44d24cd.json +++ b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json similarity index 76% rename from data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json rename to 
data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json index 8570e4d80..a083c0183 100644 --- a/data/helm_capabilities/xai/grok-3-mini-beta/b028eaaf-bc4d-4918-8464-f8c4b0c74973.json +++ b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of 
instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json similarity index 75% rename from data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json rename to data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json index b9fbeb3c0..a25562cb1 100644 --- a/data/helm_capabilities/xai/grok-4-0709/c7d55b2e-64a2-4e1d-ae18-3f60b365866d.json +++ b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Grok 4 (0709)", + "name": "Grok 4 0709", "id": "xai/grok-4-0709", "developer": "xai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.785, "details": { - "description": null, 
"tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, "generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of 
the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json similarity index 76% rename from data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json rename to data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json index b4d4807f0..43a98dd63 100644 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/7b231b0d-89b8-4a0a-825e-ccfea212f565.json +++ b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1767657480.2939079", - "retrieved_timestamp": "1767657480.2939079", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764", + "retrieved_timestamp": "1770835969.095764", "source_metadata": { "source_name": "helm_capabilities", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean score", + "source_data": { + "dataset_name": "helm_capabilities", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "The mean of the scores from all columns.", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.67, "details": { - "description": null, "tab": "Accuracy", "Mean score - Efficiency": { "description": null, @@ -39,12 +42,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU-Pro - COT correct", + "evaluation_name": "MMLU-Pro", + "source_data": { + "dataset_name": "MMLU-Pro", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MMLU-Pro\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on MMLU-Pro", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -88,15 +100,24 @@ } }, "generation_config": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "all", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "GPQA - COT correct", + "evaluation_name": "GPQA", + "source_data": { + "dataset_name": "GPQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "GPQA\n\nCOT correct: Fraction of correct answers after chain of thought", + "evaluation_description": "COT correct on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -140,15 +161,24 @@ } }, 
"generation_config": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" + "additional_details": { + "subset": "gpqa_main", + "use_chain_of_thought": "true", + "use_few_shot": "false" + } } }, { - "evaluation_name": "IFEval - IFEval Strict Acc", + "evaluation_name": "IFEval", + "source_data": { + "dataset_name": "IFEval", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "IFEval\n\nIFEval strict accuracy: Fraction of instructions in the instance that are correctly followed.", + "evaluation_description": "IFEval Strict Acc on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -191,12 +221,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WildBench - WB Score", + "evaluation_name": "WildBench", + "source_data": { + "dataset_name": "WildBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WildBench\n\nWildBench Score: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.", + "evaluation_description": "WB Score on WildBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -240,13 +279,22 @@ } }, "generation_config": { - "subset": "v2" + "additional_details": { + "subset": "v2" + } } }, { - "evaluation_name": "Omni-MATH - Acc", + "evaluation_name": "Omni-MATH", + "source_data": { + "dataset_name": "Omni-MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "Omni-MATH\n\nOmni-MATH Accuracy: Accuracy of the AI output judged by GPT-4.", + "evaluation_description": "Acc on Omni-MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -289,7 +337,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json similarity index 89% rename from data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json rename to data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json index fac51642a..152223193 100644 --- a/data/helm_classic/anthropic/Anthropic-LM-v4-s3-52B/efb110ab-85c5-49f5-af6f-9beecf46a7d4.json +++ b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/anthropic_Anthropic-LM-v4-s3-52B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": 
"documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Anthropic-LM v4-s3 52B", - "id": "anthropic/Anthropic-LM-v4-s3-52B", - "developer": "anthropic", + "id": "Anthropic-LM-v4-s3-52B", + "developer": "unknown", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.78, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model 
truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to 
light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": "CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json similarity index 89% rename from data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json rename to data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json index 05d951313..6a9a41b41 100644 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/09f5c502-2950-48fb-b25f-b562eeee26c8.json +++ b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", 
"lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.433, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json similarity index 89% rename from data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json rename to data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json index cc58c06c0..30c92ab94 100644 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/3d13f9ba-b18e-4b52-b28d-9aed0621903d.json +++ b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json similarity index 89% rename from data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json rename to data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json index 0be03d012..df8111bcc 100644 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/3c427293-0f3d-4aa8-ac62-4ed484dd74eb.json +++ b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json similarity index 89% rename from data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json rename to data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json index 3239df52d..5c8560533 100644 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/1ab7f23a-7527-4188-9141-852f5123eb19.json +++ b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.285, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json rename to data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json index 71ff2dc38..4f288f894 100644 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/f91e7178-50e2-4ad8-9ad5-2f37a29ee9e7.json +++ b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.743, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json rename to data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json index ab1f54c90..6d0308b9f 100644 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/ac5c97b3-3411-4f8d-9cb3-b6b0a540e3bd.json +++ b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json similarity index 89% rename from data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json rename to data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json index 14e3a243d..4278cef81 100644 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/67114722-a441-478b-a324-2c32be7e06a7.json +++ b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.553, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json rename to data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json index 9fccefc67..7e02805f7 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/07fa437f-398d-48ab-a74d-b8c59caf3add.json +++ b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json rename to data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json index 9f9536338..d6f8fa8ea 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/7492964a-2c16-4261-aaca-dbcd4f3be7c3.json +++ b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.485, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json similarity index 89% rename from data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json rename to data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json index ed0fa9dcd..5680298fb 100644 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/b5dace02-416d-4b90-90e1-562b22820784.json +++ b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.662, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json similarity index 89% rename from data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json rename to data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json index 19831593f..caffd542e 100644 --- a/data/helm_classic/bigscience/BLOOM-176B/0e6cd483-dff8-4fba-9239-82cb0fe34d42.json +++ b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.446, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json similarity index 90% rename from data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json rename to data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json index af37640ca..400f064d5 100644 --- a/data/helm_classic/bigscience/T0pp-11B/9ae59291-604f-4527-812a-a3150a1098f2.json +++ b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.197, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json similarity index 89% rename from data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json rename to data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json index 5eb323191..25f29c7e2 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/52026df3-2452-4fd2-a10b-73a2bfc5397e.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.874, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json similarity index 89% rename from data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json rename to data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json index d20d6332d..8f01acff1 100644 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/19b97859-5af3-4883-a878-93d026c29d87.json +++ b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} 
+ } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json similarity index 89% rename from data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json rename to data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json index 54182b504..16c06b937 100644 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/37af5185-3599-49f5-9637-55d41bc6ae81.json +++ b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.372, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json similarity index 89% rename from data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json rename to data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json index ecba92b3a..f0d42b850 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/cf32b49f-7cf8-43a3-9e28-ade7446272ab.json +++ b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.23, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json similarity index 89% rename from data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json rename to data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json index 0b33b0763..43f986e70 100644 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/ad9bd354-01d9-4a21-a299-a53190e1eb7e.json +++ b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.312, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json similarity index 89% rename from data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json rename to data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json index 4abc0c79b..adaaa9403 100644 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/12e7dc67-ae33-4f8c-b7df-7cd7d1b58694.json +++ b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.109, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json similarity index 89% rename from data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json rename to data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json index 6c362be4c..80b637746 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/ce112061-bfa6-4c71-a0f5-3c7f3cf1a560.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.56, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json similarity index 89% rename from data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json rename to data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json index f92b78094..cc49de0c7 100644 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/d75d1c98-226a-42cb-9bf3-a8e59ba7f971.json +++ b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.664, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json similarity index 88% rename from data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json rename to data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json index f6f9d6eae..bc304945b 100644 --- a/data/helm_classic/eleuther-ai/Pythia-12B/1143ee64-20a0-41f5-a5fb-35e620889662.json +++ b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-12B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 12B", - "id": "eleuther-ai/Pythia-12B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-12B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.257, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { 
"description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json similarity index 88% rename from data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json rename to data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json index 2b488fa6f..511816a71 100644 --- a/data/helm_classic/eleuther-ai/Pythia-6.9B/b454af07-11be-48b4-a3c2-032716cdf250.json +++ b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_Pythia-6.9B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Pythia 6.9B", - "id": "eleuther-ai/Pythia-6.9B", - "developer": "eleuther-ai", + "id": "eleutherai/Pythia-6.9B", + "developer": "eleutherai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - 
Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json similarity index 88% rename from data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json rename to data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json index 725954e16..8d33e45b6 100644 --- a/data/helm_classic/writer/Palmyra-X-43B/89ef08bb-e26e-4073-9179-79cd08f3bb4b.json +++ b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_Palmyra-X-43B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Palmyra X 43B", - "id": "writer/Palmyra-X-43B", - "developer": "writer", + "id": "google/Palmyra-X-43B", + "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.732, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, 
@@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json similarity index 89% rename from data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json rename to data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json index 9bacd9bf9..2a710defd 100644 --- a/data/helm_classic/google/T5-11B/df0694c5-fca3-48dc-8c6a-0ed477fa08f5.json +++ b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_T5-11B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.131, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json similarity index 89% rename from data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json rename to data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json index c9bf42a12..bb571aece 100644 --- a/data/helm_classic/google/UL2-20B/ac49ac68-0d7f-4972-bb99-0332b14df2d5.json +++ b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/google_UL2-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.167, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json similarity index 88% rename from data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json rename to data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json index 65a179431..e1d9662a3 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/39f4648c-6635-4ffa-86f5-040e69f3e054.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.706, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json similarity index 88% rename from data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json rename to data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json index bf5b7f8ab..b03d7afe6 100644 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/4ef38a9d-283c-4549-8de3-d04ce7f62542.json +++ b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.625, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": 
{ + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json similarity index 88% rename from data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json rename to data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json index b007605c7..959b52195 100644 --- a/data/helm_classic/meta/LLaMA-13B/81eee874-47be-4a55-af47-5b3e1bcbd361.json +++ b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.595, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json similarity index 88% rename from data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json rename to data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json index 8e6647f52..7f604e015 100644 --- a/data/helm_classic/meta/LLaMA-30B/2a23b568-daed-4783-9c51-5218216f5f19.json +++ b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.781, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json similarity index 88% rename from data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json rename to data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json index 1dbaa6d85..ad8c1c451 100644 --- a/data/helm_classic/meta/LLaMA-65B/584cb697-ab7c-4e9a-8eea-6d79d81a9d7e.json +++ b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json similarity index 88% rename from data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json rename to data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json index 4a772fb18..152b9e683 100644 --- a/data/helm_classic/meta/LLaMA-7B/6a2445e0-75d4-4434-aabd-645fd445a920.json +++ b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.533, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json similarity index 88% rename from data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json rename to data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json index de40c742e..f2cd54e60 100644 --- a/data/helm_classic/meta/Llama-2-13B/f5d57067-8a00-490f-b1bf-30afd0b0f126.json +++ b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.823, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json similarity index 88% rename from data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json rename to data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json index 77f6938f9..de031e670 100644 --- a/data/helm_classic/meta/Llama-2-70B/cb8802af-613e-42a1-b025-31532996eb10.json +++ b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.944, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json similarity index 88% rename from data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json rename to data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json index 3b3b39208..eac315fea 100644 --- a/data/helm_classic/meta/Llama-2-7B/ff02bc45-8476-4ea6-96d8-78ff6a0e0064.json +++ b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.607, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json similarity index 89% rename from data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json rename to data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json index 0da99434e..63a0c348d 100644 --- a/data/helm_classic/meta/OPT-175B/75a5843f-73a4-4ff3-94b5-184152ff703c.json +++ b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.609, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json similarity index 89% rename from data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json rename to data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json index 929a020d2..2f3d2ad96 100644 --- a/data/helm_classic/meta/OPT-66B/83d19197-aebd-43fa-a7ed-20818a9e5d8e.json +++ b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.448, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json similarity index 89% rename from data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json rename to data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json index 786e640a5..ddcfa82ef 100644 --- a/data/helm_classic/microsoft/TNLG-v2-530B/dd121d07-5198-4ac6-81d6-df38485bff25.json +++ b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.787, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", 
+ "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json similarity index 89% rename from data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json rename to data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json index ade6f8a0a..b3f527a04 100644 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/f23680f4-8b5a-4baf-9e8d-74f0f4847183.json +++ b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.309, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", 
+ "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json similarity index 88% rename from data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json rename to data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json index a4f716c06..1fd56a99f 100644 --- a/data/helm_classic/mistral-ai/Mistral-v0.1-7B/369d4026-1c0b-4e75-ad65-109dfb79978a.json +++ b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mistral-ai_Mistral-v0.1-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Mistral v0.1 7B", - "id": "mistral-ai/Mistral-v0.1-7B", - "developer": "mistral-ai", + "id": "mistralai/Mistral-v0.1-7B", + "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.884, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json similarity index 88% rename from data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json rename to data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json index bf414b629..b0d1817b0 100644 --- a/data/helm_classic/mosaicml/MPT-30B/cd808be0-c4e5-4656-8bd2-ac6cd3f922e1.json +++ b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.714, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json similarity index 88% rename from data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json rename to data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json index dd4c71e77..771c4ac02 100644 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/182a7373-7ea3-4f2b-b730-af16e20b9fa7.json +++ b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.716, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json similarity index 89% rename from data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json rename to data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json index 64c16a070..20a0f0d63 100644 --- a/data/helm_classic/eleuther-ai/GPT-J-6B/8f3469ef-4b41-4452-b7be-f00059fb1920.json +++ b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-J-6B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-J 6B", - "id": "eleuther-ai/GPT-J-6B", - "developer": "eleuther-ai", + "id": "openai/GPT-J-6B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.273, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } 
}, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json similarity index 89% rename from data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json rename to data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json index b26d9ed28..0c00ea05c 100644 --- a/data/helm_classic/eleuther-ai/GPT-NeoX-20B/82427784-0189-4aed-8e0e-42ea2435e27a.json +++ b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/eleuther-ai_GPT-NeoX-20B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "GPT-NeoX 20B", - "id": "eleuther-ai/GPT-NeoX-20B", - "developer": "eleuther-ai", + "id": "openai/GPT-NeoX-20B", + "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { 
"description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json similarity index 93% rename from data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json rename to data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json index ee84122f5..5355ce78b 100644 --- a/data/helm_classic/openai/ada-350M/f7ef6c05-4d3c-475f-9217-fb3afa9cb752.json +++ b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_ada-350M/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.108, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json similarity index 93% rename from data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json rename to data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json index 3a55a8db1..d3977fc36 100644 --- a/data/helm_classic/openai/babbage-1.3B/1c4a54f3-4599-441b-8f30-5e275a0597a7.json +++ b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.114, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json similarity index 93% rename from data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json rename to data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json index d7959e7bb..fe011ca06 100644 --- a/data/helm_classic/openai/curie-6.7B/dbefbdbd-b64e-40e9-b632-0dcae3f33913.json +++ b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.247, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": 
"MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json similarity index 93% rename from data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json rename to data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json index 6b30fefef..b376d2873 100644 --- a/data/helm_classic/openai/davinci-175B/f49bd5aa-bb27-43cf-a0f3-3aa4c7ed0b3e.json +++ b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.538, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json similarity index 88% rename from data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json rename to data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json index 435cb040d..8051b9b3e 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/2e918ebc-fbd6-4bbe-8604-e759cf5d4473.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.76, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json similarity index 88% rename from data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json rename to data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json index bf7553bf6..b2682e6f7 100644 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/826d8e72-7332-48b1-af41-537e505c9e11.json +++ b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.783, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + 
"evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json similarity index 93% rename from data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json rename to data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json index d1a92ef67..43f728bf2 100644 --- a/data/helm_classic/openai/text-ada-001/c34ec087-f3a1-49f1-8ff7-79f353171c4c.json +++ b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.107, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json similarity index 93% rename from data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json rename to data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json index fb51f6a42..fbb4b5bb6 100644 --- a/data/helm_classic/openai/text-babbage-001/09763c40-c365-4be9-befc-970ce1886641.json +++ b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.229, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json similarity index 93% rename from data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json rename to data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json index bb4d6e7ff..4537bcc84 100644 --- a/data/helm_classic/openai/text-curie-001/4ece7c38-114a-4973-ba13-ac3821c9836f.json +++ b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.36, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + 
"source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json similarity index 93% rename from data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json rename to data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json index 4d9b820e6..0e9fa4947 100644 --- a/data/helm_classic/openai/text-davinci-002/75b2178d-8f0d-4b4c-b31c-752f0cdeb522.json +++ b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.905, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json similarity index 92% rename from data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json rename to data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json index 437247369..9ca831c0f 100644 --- a/data/helm_classic/openai/text-davinci-003/0c43aeaf-c7d3-4e00-8b84-5115a6396585.json +++ b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.872, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": 
"MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json similarity index 88% rename from data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json rename to data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json index 24ce27c0b..cf2a4b297 100644 --- a/data/helm_classic/stanford/Alpaca-7B/d25691b8-37e7-42ff-b59a-8684197280f1.json +++ b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.381, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + 
"dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json rename to data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json index a08e1b6ca..97f13c6d9 100644 --- a/data/helm_classic/tii-uae/Falcon-40B/da3f6768-fa98-4aff-bf8a-db910edeabb2.json +++ b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 40B", - "id": "tii-uae/Falcon-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.729, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, 
- "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json rename to data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json index 0911bfafa..80c0ac18a 100644 --- a/data/helm_classic/tii-uae/Falcon-7B/9f6dda65-e6e4-4a05-bdb5-ec91784600ff.json +++ b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon 7B", - "id": "tii-uae/Falcon-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json rename to data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json index 99345e7ef..4b7c6b681 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-40B/f936c641-4a7b-4d78-899f-e26256570592.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-40B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 40B", - "id": "tii-uae/Falcon-Instruct-40B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-40B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.727, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json similarity index 88% rename from data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json rename to data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json index b0b75c2b1..cd7efa818 100644 --- a/data/helm_classic/tii-uae/Falcon-Instruct-7B/7f04feb5-92b5-4d6c-96c6-7f66bfc88e96.json +++ b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/tii-uae_Falcon-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -13,13 +10,20 @@ }, "model_info": { "name": "Falcon-Instruct 7B", - "id": "tii-uae/Falcon-Instruct-7B", - "developer": "tii-uae", + "id": "tiiuae/Falcon-Instruct-7B", + "developer": "tiiuae", "inference_platform": "unknown" }, "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.244, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json rename to data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json index 66ae49567..f25c83f2e 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/8db87a70-babc-4776-8317-70752d3c5546.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.378, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json rename to data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json index f09058f3c..d4d85552c 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/3da308fb-2403-432e-bde3-3b14af627552.json +++ b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json index 9ed3b7bf9..9d60f7506 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/fd8f7b08-813c-4369-bfe4-d86eacc874ea.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + 
"generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json similarity index 88% rename from data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json rename to data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json index bb56f1198..57ffafd39 100644 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e0cf4bca-e6c6-4eb4-81b2-19c88d0ddd21.json +++ b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.366, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json similarity index 89% rename from data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json rename to data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json index add4859be..fe1ab40e2 100644 --- a/data/helm_classic/writer/InstructPalmyra-30B/bcf54365-b229-4abf-8ff8-59b4b46fa829.json +++ b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.568, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - 
EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json similarity index 89% rename from data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json rename to data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json index 74662144a..61a019ad2 100644 --- a/data/helm_classic/yandex/YaLM-100B/eae6f2a0-c13a-471a-82e9-03f331b1dbe0.json +++ b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.075, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json similarity index 89% rename from data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json rename to data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json index 2f17c575d..04bdfa490 100644 --- a/data/helm_classic/zhipu-ai/GLM-130B/f45719e5-3334-4e1d-8a83-f5f8292cb977.json +++ b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1768090731.5328572", - "retrieved_timestamp": "1768090731.5328572", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661", + "retrieved_timestamp": "1770834891.1472661", "source_metadata": { "source_name": "helm_classic", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_classic", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperform on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.512, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Calibration": { "description": null, @@ -74,12 +77,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": 
"MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://openreview.net/forum?id=d7KBjmI3GmQ).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -142,12 +154,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "BoolQ - EM", + "evaluation_name": "BoolQ", + "source_data": { + "dataset_name": "BoolQ", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The BoolQ benchmark for binary (yes/no) question answering [(Clark et al., 2019)](https://aclanthology.org/N19-1300/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on BoolQ", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -235,12 +256,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -328,12 +358,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (open-book) - F1", + "evaluation_name": "NaturalQuestions (open-book)", + "source_data": { + "dataset_name": "NaturalQuestions (open-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (open-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +535,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "QuAC - F1", + "evaluation_name": "QuAC", + "source_data": { + "dataset_name": "QuAC", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The QuAC benchmark for question answering in the context of dialogues [(Choi et al., 2018)](https://aclanthology.org/D18-1241/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on QuAC", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -589,12 +637,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "HellaSwag - EM", + "evaluation_name": "HellaSwag", + "source_data": { + "dataset_name": "HellaSwag", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The HellaSwag benchmark for commonsense reasoning in question answering [(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on HellaSwag", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -657,12 +714,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -725,12 +791,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "TruthfulQA - EM", + "evaluation_name": "TruthfulQA", + "source_data": { + "dataset_name": "TruthfulQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The TruthfulQA benchmarking for measuring model truthfulness and commonsense knowledge in question answering [(Lin et al., 2022)](https://aclanthology.org/2022.acl-long.229/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on TruthfulQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -793,12 +868,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "MS MARCO (TREC) - NDCG@10", + "evaluation_name": "MS MARCO (TREC)", + "source_data": { + "dataset_name": "MS MARCO (TREC)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MS MARCO benchmark's deep learning TREC track for passage retrieval in information retrieval [(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).\n\nNDCG@10: Normalized discounted cumulative gain at 10 in information retrieval.", + "evaluation_description": "NDCG@10 on MS MARCO (TREC)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,12 +1035,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CNN/DailyMail - ROUGE-2", + "evaluation_name": "CNN/DailyMail", + "source_data": { + "dataset_name": "CNN/DailyMail", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CNN/DailyMail benchmark for text summarization ([Hermann et al., 2015](https://papers.nips.cc/paper/2015/hash/afdec7005cc9f14302cd0474fd0f3c96-Abstract.html); [Nallapati et al.,2016](https://aclanthology.org/K16-1028/)).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on CNN/DailyMail", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1074,12 +1167,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "XSUM - ROUGE-2", + "evaluation_name": "XSUM", + "source_data": { + "dataset_name": "XSUM", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The XSUM benchmark for text summarization of BBC news articles [(Narayan et al., 2018)](https://aclanthology.org/D18-1206/).\n\nROUGE-2: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.", + "evaluation_description": "ROUGE-2 on XSUM", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1197,12 +1299,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "IMDB - EM", + "evaluation_name": "IMDB", + "source_data": { + "dataset_name": "IMDB", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The IMDB benchmark for sentiment analysis in movie review [(Maas et al., 2011)](https://aclanthology.org/P11-1015/).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on IMDB", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1290,12 +1401,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "CivilComments - EM", + "evaluation_name": "CivilComments", + "source_data": { + "dataset_name": 
"CivilComments", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The CivilComments benchmark for toxicity detection [(Borkan et al., 2019)](https://arxiv.org/pdf/1903.04561.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on CivilComments", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1383,12 +1503,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "RAFT - EM", + "evaluation_name": "RAFT", + "source_data": { + "dataset_name": "RAFT", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text classification tasks [(Alex et al., 2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on RAFT", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1476,7 +1605,9 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json b/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json deleted file mode 100644 index 841d52f14..000000000 --- a/data/helm_instruct/anthropic/claude-v1.3/c4e55239-581b-433f-82bc-68a690f59e4a.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.965, - "details": { - "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.981, - "details": { - "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.975, - "details": { - "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.989, - "details": { - "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json new file mode 100644 index 000000000..31ab229b7 --- /dev/null +++ b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Anthropic Claude v1.3", + "id": "anthropic/claude-v1.3", + "developer": "anthropic", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.611, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF 
dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.965, + "details": { + "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.981, + "details": { + "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.975, + "details": { + "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": 
"Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.992, + "details": { + "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.989, + "details": { + "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json new file mode 100644 index 000000000..2fd221159 --- /dev/null +++ b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "Cohere Command beta 52.4B", + "id": "cohere/command-xlarge-beta", + "developer": "cohere", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.089, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.214, + "details": { + "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + 
"claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.988, + "details": { + "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.969, + "details": { + "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.967, + "details": { + "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.971, + "details": { + "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": 
false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json b/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json deleted file mode 100644 index 0905e2f21..000000000 --- a/data/helm_instruct/cohere/command-xlarge-beta/8a68cccf-2965-4867-b922-460cc5b695de.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta (52.4B)", - "id": "cohere/command-xlarge-beta", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.089, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.214, - "details": { - "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.988, - "details": { - "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.969, - "details": { - "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.967, - "details": { - "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.971, - "details": { - "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json new file mode 100644 index 000000000..23dfc4397 --- /dev/null +++ b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-3.5 Turbo 0613", + "id": "openai/gpt-3.5-turbo-0613", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.689, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": 
"Anthropic RLHF dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.964, + "details": { + "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.986, + "details": { + "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + 
"evaluation_description": "Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.99, + "details": { + "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.992, + "details": { + "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json deleted file mode 100644 index 4dc9e1ef5..000000000 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/a6cf2367-3615-421e-9bb6-a0c3f1d5f1ed.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo (0613)", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.964, - "details": { - "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.99, - "details": { - "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json new file mode 100644 index 000000000..9ad1bca2e --- /dev/null +++ b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json @@ -0,0 +1,267 @@ +{ + "schema_version": "0.2.0", + "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701", + "retrieved_timestamp": "1770834858.3559701", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "model_info": { + "name": "GPT-4 0314", + "id": "openai/gpt-4-0314", + "developer": "openai", + "inference_platform": "unknown" + }, + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.611, + "details": { + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF dataset", + "source_data": { + 
"dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.934, + "details": { + "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": [ + "hh", + "hh", + "hh", + "hh", + "red_team", + "red_team", + "red_team", + "red_team" + ], + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale", + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.973, + "details": { + "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", + "tags": "", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.966, + "details": { + "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.986, + "details": { + "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "en", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Self 
Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.976, + "details": { + "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.995, + "details": { + "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "all", + "evaluator": [ + "claude", + "gpt4", + "mturk", + "scale" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json b/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json deleted file mode 100644 index f76268b07..000000000 --- a/data/helm_instruct/openai/gpt-4-0314/d4833e0d-b2ca-4161-a503-f5d4d9545bb0.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "schema_version": "0.1.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1768085895.632564", - "retrieved_timestamp": "1768085895.632564", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ], - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 (0314)", - "id": "openai/gpt-4-0314", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": null, - "tab": "Instruction Following" - } - }, - "generation_config": {} - }, - { - "evaluation_name": "Anthropic RLHF dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). 
We only use the first utterance of each dialogue.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.934, - "details": { - "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Best ChatGPT Prompts - Harmlessness", - "metric_config": { - "evaluation_description": "A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.973, - "details": { - "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Koala test dataset - Harmlessness", - "metric_config": { - "evaluation_description": "The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.966, - "details": { - "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Open Assistant - Harmlessness", - "metric_config": { - "evaluation_description": "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). 
We only use the initial prompt in each conversation.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Self Instruct - Harmlessness", - "metric_config": { - "evaluation_description": "The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.976, - "details": { - "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - }, - { - "evaluation_name": "Vicuna - Harmlessness", - "metric_config": { - "evaluation_description": "The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.\n\nHarmlessness: Whether the response avoids using toxic language or helping the user with harmful goals.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json similarity index 76% rename from data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json rename to data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json index 8d9b0c6e7..946b7db3e 100644 --- a/data/helm_lite/01-ai/yi-34b/3b8567cf-40f0-4d63-ad12-9b1712a2c503.json +++ b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.57, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 
57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference 
up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json similarity index 76% rename from data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json rename to data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json index 04e690e09..28ba5fb69 100644 --- a/data/helm_lite/01-ai/yi-6b/3b94c757-b54d-462c-a2a1-d331711a0833.json +++ b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ 
"evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.253, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json similarity index 76% rename from data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json rename to data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json index 6d66d647a..9fe678bb4 100644 --- a/data/helm_lite/01-ai/yi-large-preview/3d0d4d91-1f1a-4cca-b837-878faa03e7e6.json +++ b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.471, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json rename to data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json index 4d89d0b52..fb405652b 100644 --- a/data/helm_lite/AlephAlpha/luminous-base/b4fa23d2-48cd-4a58-b70d-25b466781008.json +++ b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Base (13B)", + "name": "Luminous Base 13B", "id": "AlephAlpha/luminous-base", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, 
@@ -30,7 +34,6 @@ "score_details": { "score": 0.041, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that 
the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json rename to data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json index 74581377a..786a7e340 100644 --- a/data/helm_lite/AlephAlpha/luminous-extended/818cfaa1-815b-4a13-b017-5e6c30ed9de3.json +++ b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Extended (30B)", + "name": "Luminous Extended 30B", "id": "AlephAlpha/luminous-extended", "developer": 
"AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.078, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json similarity index 76% rename from data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json rename to data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json index 9f7e37eaf..78da47969 100644 --- a/data/helm_lite/AlephAlpha/luminous-supreme/62727554-ab2c-4218-9c3c-3eba48420834.json +++ b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Luminous Supreme (70B)", + "name": "Luminous Supreme 70B", "id": "AlephAlpha/luminous-supreme", "developer": "AlephAlpha", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.145, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json similarity index 76% rename from data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json rename to data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json index 9efa2b824..2b870e958 100644 --- a/data/helm_lite/ai21/j2-grande/c58c4299-ede8-46b6-8d33-2f900c272853.json +++ b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Grande (17B)", + "name": "Jurassic-2 Grande 17B", "id": "ai21/j2-grande", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": 
false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.172, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json similarity index 76% rename from data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json rename to data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json index 1c64f2731..643b24001 100644 --- a/data/helm_lite/ai21/j2-jumbo/bcd6ffc0-3d3c-423f-9542-00246b3b1f43.json +++ b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Jurassic-2 Jumbo (178B)", + "name": "Jurassic-2 Jumbo 178B", "id": "ai21/j2-jumbo", "developer": "ai21", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + 
"source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json similarity index 76% rename from data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json rename to data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json index 634cd87ae..a07da123a 100644 --- a/data/helm_lite/ai21/jamba-1.5-large/38918b97-2707-4b53-99a8-7a67816f398c.json +++ b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.637, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives 
[(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": 
{ + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json similarity index 76% rename from data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json rename to data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json index 3483b0b9a..9e0628c9d 100644 --- a/data/helm_lite/ai21/jamba-1.5-mini/82ed1b8c-74c3-48ed-9a0c-d4ce88088648.json +++ b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.414, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json similarity index 76% rename from data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json rename to data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json index 527fb50a5..9e1241a8e 100644 --- a/data/helm_lite/ai21/jamba-instruct/9278a23a-cecd-446c-b234-2301e1e44c40.json +++ b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.287, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, 
{ - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - 
"college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ 
+ "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,12 +628,14 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json similarity index 76% rename from data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json rename to data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json index 51634a355..b68794dd1 100644 --- a/data/helm_lite/allenai/olmo-7b/81aadbf6-7b74-4a3d-aeaa-e9d39b75fc54.json +++ b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average 
(over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.052, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json similarity index 76% rename from data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json rename to data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json index e8381a3f3..084734ba7 100644 --- a/data/helm_lite/amazon/nova-lite-v1:0/034168e5-90a0-4816-a9fb-1c2f5e733811.json +++ b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json similarity index 76% rename from data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json rename to data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json index 8fb5d6b37..fb66c7744 100644 --- a/data/helm_lite/amazon/nova-micro-v1:0/74d72f92-a824-4f3a-93ae-b37e16691ad9.json +++ b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension 
over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", 
+ "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json similarity index 76% rename from data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json rename to data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json index 52c65584f..c7f9d86e2 100644 --- a/data/helm_lite/amazon/nova-pro-v1:0/f91c057f-5f5d-4183-abf4-54b44e82da2b.json +++ b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json similarity index 76% rename from data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json rename to data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json index b883ce7c5..ab0989b58 100644 --- a/data/helm_lite/anthropic/claude-2.0/b2b9e87c-76de-4716-8d28-4b13a34c360f.json +++ b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.489, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": 
null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - 
"function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json similarity index 76% rename from data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json rename to data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json index 388a1840c..2adbb62af 100644 --- a/data/helm_lite/anthropic/claude-2.1/0bd11df6-a037-4f55-a78a-cc23c34c0958.json +++ b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over 
columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.437, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json rename to data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json index 231b91f4e..ff757a7ad 100644 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/f4061c6a-f82f-4642-a734-f6adb0be7519.json +++ b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, 
"model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.531, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json index 0ee2e76e5..2c4b0d7d1 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/18de115f-32ab-4b2a-b4b2-2ff9553b12f0.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.885, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - 
"evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - 
"college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + 
"abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json rename to data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json index d816a8a2a..4b9824f13 100644 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/d0cd5626-5b2c-46df-b265-e130a789a0e7.json +++ b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.846, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json rename to data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json index 66e3c14b8..8eac62865 100644 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/3eea5b0f-1126-448f-94e5-52a874baa61a.json +++ b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.263, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json rename to data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json index 27c9ec758..d590c786e 100644 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/9fa44303-4699-47f2-9777-0c118e36d87e.json +++ b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.683, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json similarity index 76% rename from data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json rename to data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json index 3cbea3718..90baddbf7 100644 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/a2d019d6-52bf-439f-90f0-74583928e5c0.json +++ b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.377, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json similarity index 76% rename from data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json rename to data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json index a1592f60e..c3ca60cb8 100644 --- a/data/helm_lite/anthropic/claude-instant-1.2/0f884c98-ea5e-4409-81e2-40aa5c84f99d.json +++ b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.399, "details": { - 
"description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, 
"generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json similarity index 76% rename from data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json rename to data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json index e73713e6a..da3e6b3b3 100644 --- a/data/helm_lite/anthropic/claude-v1.3/2e1efde7-6f64-40b8-86ce-8cc29c6a78bf.json +++ b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { 
"evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.518, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 
57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference 
up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json similarity index 76% rename from data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json rename to data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json index aabe52512..a431f3338 100644 --- a/data/helm_lite/cohere/command-light/8c312031-5da7-4816-8207-056fe1bc161d.json +++ b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-light/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.105, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json similarity index 76% rename from data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json rename to data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json index 288bdd798..d0f464767 100644 --- a/data/helm_lite/cohere/command-r-plus/71c0558f-7b56-40ea-a1be-2749b88758c7.json +++ b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.441, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over 
narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + 
"source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
+ ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json similarity index 76% rename from data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json rename to data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json index 33b212443..51821d155 100644 --- a/data/helm_lite/cohere/command-r/d1330068-2c16-450e-8ce5-1d05f5e842d9.json +++ b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command-r/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.299, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json similarity index 76% rename from data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json rename to data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json index b95f59ea4..488fa54b9 100644 --- a/data/helm_lite/cohere/command/dec04718-1ae9-4e4b-92da-01d789424f69.json +++ b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/cohere_command/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/cohere_command/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.327, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json similarity index 76% rename from data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json rename to data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json index 7cf9a9388..9dc0aa32d 100644 --- a/data/helm_lite/databricks/dbrx-instruct/ba50499a-6cfd-4f04-aab5-c2122202cc74.json +++ b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.289, "details": { - "description": null, "tab": "Accuracy", 
"Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json similarity index 76% rename from data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json rename to data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json index bf2730468..201ddf6e5 100644 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/35bf65f3-d585-4fb9-8c9d-6b1e1dccb569.json +++ b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" @@ -20,6 
+17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json similarity index 76% rename from data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json rename to data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json index e07480be1..b5f8e240f 100644 --- a/data/helm_lite/deepseek-ai/deepseek-v3/d11c2c6d-b5d0-4c40-bd8e-d6bd194aadf5.json +++ b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.908, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading 
comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + 
"evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json similarity index 76% rename from data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json rename to data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json index eefe2f954..eabdc0bbd 100644 --- a/data/helm_lite/google/gemini-1.0-pro-002/1e98157d-49e6-4d66-ae21-a95d419c47e3.json +++ b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (002)", + "name": "Gemini 1.0 Pro 002", "id": "google/gemini-1.0-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.422, "details": { - "description": null, "tab": "Accuracy", "Mean 
win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json rename to data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json index e10645540..991b81669 100644 --- a/data/helm_lite/google/gemini-1.5-flash-001/e92bce18-690a-44eb-8bc5-28e9303473bb.json +++ b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean 
win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.667, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json rename to data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json index 8e4eb067b..725c639a2 100644 --- a/data/helm_lite/google/gemini-1.5-flash-002/3a54f656-78bd-4fbb-97c5-ae12ed6f888c.json +++ b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.573, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": 
"NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + 
"additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + 
} } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json rename to data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json index 38c3a236a..8b7eab026 100644 --- a/data/helm_lite/google/gemini-1.5-pro-001/b1ecfc78-f59e-437f-b163-9253ad092799.json +++ b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average 
(over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.739, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json similarity index 76% rename from data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json rename to data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json index cada735aa..ebd3081fb 100644 --- a/data/helm_lite/google/gemini-1.5-pro-002/04415dda-306f-420c-8af8-54336368fc40.json +++ b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 
002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.842, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json similarity index 76% rename from data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json rename to data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json index 1487ce304..b96b71c0c 100644 --- a/data/helm_lite/google/gemini-2.0-flash-exp/ef8afc84-3f35-4d93-ab2e-0f07f25b9dde.json +++ b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.813, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { 
+ "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json similarity index 76% rename from data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json rename to data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json index 29456a114..ea107cc9e 100644 --- a/data/helm_lite/google/gemma-2-27b-it/5eb1e8ba-361a-4b37-b865-7ae6f7ccde80.json +++ b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (27B)", + "name": "Gemma 2 Instruct 27B", "id": "google/gemma-2-27b-it", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many 
models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.675, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json similarity index 76% rename from data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json rename to data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json index 75457f70d..1488d6604 100644 --- a/data/helm_lite/google/gemma-2-9b-it/63af45df-c46d-46df-8f3e-592181ce6a7a.json +++ b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 Instruct (9B)", + "name": "Gemma 2 Instruct 9B", "id": "google/gemma-2-9b-it", 
"developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.562, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json similarity index 76% rename from data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json rename to data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json index dabc86d10..810e32965 100644 --- a/data/helm_lite/google/gemma-7b/aad88f1f-6047-45e7-8b0f-d5deac20be68.json +++ b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_gemma-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json similarity index 76% rename from data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json rename to data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json index 9c9727ed0..30d0e3442 100644 --- a/data/helm_lite/google/text-bison@001/f0bde02f-bde8-40c5-abc5-9cb4a25a55ce.json +++ b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-bison@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { 
"score": 0.526, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json similarity index 76% rename from data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json rename to data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json index 2e152e4a7..d5841340f 100644 --- a/data/helm_lite/google/text-unicorn@001/35f70e20-8a08-4f7c-b822-5238337d4177.json +++ b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.644, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json similarity index 76% rename from data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json rename to data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json index a5b394c06..079c14180 100644 --- a/data/helm_lite/meta/llama-2-13b/e19c56fc-5f6c-48a0-874a-97665283e6f0.json +++ b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.233, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json similarity index 76% rename from data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json rename to data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json index cf4407980..8faa07285 100644 --- a/data/helm_lite/meta/llama-2-70b/98a0c9bb-9679-4cc5-85b8-8801dbb965de.json +++ b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.482, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": 
[ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json similarity index 76% rename from data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json rename to data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json index 3b18db79e..bb2c02730 100644 --- a/data/helm_lite/meta/llama-2-7b/fad21bfe-048f-412c-b3fd-9b43d276b2a2.json +++ b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.152, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json similarity index 76% rename from data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json rename to data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json index 90d04801d..876850010 100644 --- a/data/helm_lite/meta/llama-3-70b/b1e28406-d88d-4acd-a268-7baebc9b565a.json +++ b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.793, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json similarity index 76% rename from data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json rename to data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json index 0e3ff704d..87ab72524 100644 --- a/data/helm_lite/meta/llama-3-8b/60696eaf-669d-49bf-bebe-6cd171522faa.json +++ b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json rename to data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json index 8311edd73..0bc6225d5 100644 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/ad2fdc9f-20fd-4ad6-8cea-0380c297b725.json +++ b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": 
"unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.854, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json rename to data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json index 3e59bea75..d57074cb2 100644 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/c3b72d96-9af5-4e32-b420-e85a88e82e5a.json +++ b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json similarity index 76% rename from data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json rename to data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json index 300f5dbb2..198d81cd2 100644 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/57b2177d-0232-41ca-aa3a-b2ecb7af7586.json +++ b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.303, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json similarity index 76% rename from data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json rename to data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json index 4daa7f500..722a6f050 100644 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/6ed32ce2-18e5-4d1b-94f8-443f81892275.json +++ b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - 
"abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + 
"additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json similarity index 76% rename from data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json rename to data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json index 17f50b1c8..8bef7c4e9 100644 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/5c11f938-7933-45ae-8530-05dac1012f10.json +++ b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { 
"evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.819, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json similarity index 76% rename from data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json rename to data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json index 06851628a..cc4cca983 100644 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/2b9e00e5-15e1-45ea-a345-32a3d84460fb.json +++ b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.812, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + 
"evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json similarity index 76% rename from data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json rename to data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json index 624d96ab6..ebea32b6c 100644 --- a/data/helm_lite/meta/llama-65b/3e27a5c3-a752-4790-b219-5964331e40ac.json +++ b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/meta_llama-65b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "LLaMA (65B)", + "name": "LLaMA 65B", "id": "meta/llama-65b", "developer": "meta", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", 
"lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.345, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json similarity index 76% rename from data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json rename to data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json index 42e0ca1f2..ee330c2d2 100644 --- a/data/helm_lite/microsoft/phi-2/061081c1-6044-40ec-b4a7-1668b8f3ba4f.json +++ b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
+ ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.169, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json similarity index 76% rename from data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json rename to data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json index 40407df59..6d945026f 100644 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/33df0ce7-048b-4a1b-816c-a6221afe41de.json +++ b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": 
"NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json similarity index 76% rename from data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json rename to data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json index 4a88d2532..c7b88764b 100644 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/a3f47cc2-0563-4285-b777-0fcc3c642249.json +++ b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.473, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json similarity index 76% rename from data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json rename to data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json index 81cb62772..fd0f8e02b 100644 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/067ef4d7-387c-4c09-a1c4-a10af69811f0.json +++ b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - 
F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json similarity index 76% rename from data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json rename to data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json index 17ebd8348..8f4801f23 100644 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/0a07f39c-745a-46c3-ad11-c79a50cc18bb.json +++ b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": "Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.292, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json similarity index 76% rename from data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json rename to data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json index ca506f27c..d8d60cc37 100644 --- a/data/helm_lite/mistralai/mistral-large-2402/35797854-d46a-4646-94a2-3acf1d484418.json +++ b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.328, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json similarity index 76% rename from data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json rename to data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json index a10172374..d75c9932b 100644 --- a/data/helm_lite/mistralai/mistral-large-2407/3f1d4124-11ca-43af-ae0a-ae08b05d2a73.json +++ b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-large-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.744, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json similarity index 76% rename from data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json rename to data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json index 966d4c393..6bb7115e2 100644 --- a/data/helm_lite/mistralai/mistral-medium-2312/33bd2b4e-0292-47b7-84de-de6ff5804257.json +++ b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Medium (2312)", + "name": "Mistral Medium 2312", "id": "mistralai/mistral-medium-2312", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.268, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": 
{ + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": 
"multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json similarity index 76% rename from data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json rename to data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json index 039a9d5cc..1f2cb2632 100644 --- a/data/helm_lite/mistralai/mistral-small-2402/67edb54d-efed-4a23-97ef-6d2a9f254ae1.json +++ b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, 
"metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.288, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json similarity index 76% rename from data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json rename to data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json index 781bbb2c8..e6bfd0332 100644 --- a/data/helm_lite/mistralai/mixtral-8x22b/ba5dc39a-9a5b-4523-be26-b8d784c2a5ef.json +++ b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.705, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json similarity index 76% rename from data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json rename to data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json index 818a4bd2a..7bf0323b1 100644 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/469d069f-581e-415c-9c9d-f57e7c972da5.json +++ b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many 
models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.51, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + 
"evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json similarity index 76% rename from data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json rename to data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json index dfc851db9..7fee5cb57 100644 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/c9a3f927-041f-47cf-ae02-03fe4be0a59e.json +++ b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - 
"name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.333, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json similarity index 76% rename from data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json rename to data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json index 28acf453d..878d33981 100644 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/1a8c4f2e-04a0-4c08-8966-d7eaa7dd6462.json +++ b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.358, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + 
"additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + 
} } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json similarity index 76% rename from data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json rename to data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json index 6fa2534b1..7ff111f74 100644 --- a/data/helm_lite/openai/gpt-4-0613/4e58fdd9-e14c-441a-a9fb-4c525a615880.json +++ b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { 
"score": 0.867, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json similarity index 76% rename from data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json rename to data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json index c0d921b54..060ab8fb5 100644 --- a/data/helm_lite/openai/gpt-4-1106-preview/252ec309-9b98-463e-aee4-6e28deb6dcfb.json +++ b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 
@@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.698, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json similarity index 76% rename from data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json rename to data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json index 599344447..dae83b652 100644 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/5530c426-2321-4aa3-b860-f9b764b7b748.json +++ b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.864, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + 
"source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - 
"method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json rename to data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json index 98feb8bc0..c23053f17 100644 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/da92cfe0-b066-416a-9408-3eb9d36b9fb3.json +++ b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.938, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json rename to data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json index cb595e51b..f8d7c3614 100644 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/2a752701-a826-4316-b3eb-e9eec90a5a89.json +++ b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": 
"openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.928, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json similarity index 76% rename from data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json rename to data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json index 3fb056373..3869cb246 100644 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/bea4af4b-8155-4784-9192-b40270d574af.json +++ b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.701, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + 
"source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - 
"method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + 
"international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json similarity index 76% rename from data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json rename to data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json index d390f5b2a..f3294dd85 100644 --- a/data/helm_lite/openai/text-davinci-002/d08eccd1-602c-4d64-a487-2d9c028459a0.json +++ b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-002)", + "name": "GPT-3.5 text-davinci-002", "id": "openai/text-davinci-002", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": 
"How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.336, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et 
al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", 
+ "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json similarity index 76% rename from data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json rename to data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json index 99961f779..93f27df2b 100644 --- a/data/helm_lite/openai/text-davinci-003/3cceb22d-7ce9-49a1-a677-548a97c52970.json +++ b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 (text-davinci-003)", + "name": "GPT-3.5 text-davinci-003", 
"id": "openai/text-davinci-003", "developer": "openai", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.439, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json rename to data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json index 6aed691a1..800f57826 100644 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/6fd88ffb-a8b3-4f30-be39-38d4532ca16d.json +++ b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.55, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA 
- EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json rename to data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json index f6c7858eb..c8749e5f5 100644 --- a/data/helm_lite/qwen/qwen1.5-14b/9b1ee735-bc25-48fd-94cd-24f17edcdc21.json +++ b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.425, "details": { - "description": null, 
"tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": 
[ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json rename to data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json index 1314aa204..699c1515b 100644 --- a/data/helm_lite/qwen/qwen1.5-32b/a648cb90-bcce-4171-a664-df0b19304833.json +++ b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.546, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json rename to data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json index 6da42bd5b..8b347b68d 100644 --- a/data/helm_lite/qwen/qwen1.5-72b/5dace0c5-46f5-4ad4-ac48-1daacee28fe6.json +++ b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.608, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json similarity index 76% rename from data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json rename to data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json index a4d0226b9..b1bc89d92 100644 --- a/data/helm_lite/qwen/qwen1.5-7b/71d69629-11b9-4f06-98ca-536f1ab22f2c.json +++ b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.275, "details": { - "description": null, "tab": 
"Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - 
"abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json similarity index 76% rename from data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json rename to data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json index 4e8665e6b..58edcde03 100644 --- a/data/helm_lite/qwen/qwen2-72b-instruct/a594b434-eeb2-41f5-b23d-eea23ed2add2.json +++ b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + 
"dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.77, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json similarity index 76% rename from data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json rename to data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json index 9e7699d4b..3e08a0cdf 100644 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/e6a833e5-6b86-4d32-be03-010fdfde3ffc.json +++ b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.745, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": 
"NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + 
"function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json similarity index 76% rename from data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json rename to data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json index 126ae4e72..3f844c281 100644 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/cc7d5cc5-d91e-4e54-bbff-dfc867586c77.json +++ b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.488, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,20 +506,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -496,12 +571,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -545,13 +629,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json similarity index 77% rename from data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json rename to data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json index a52059819..09f377d89 100644 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/2fb84697-ac0c-4d3f-a2be-74a9bd3f5797.json +++ b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.338, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + 
}, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": 
"multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json similarity index 76% rename from data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json rename to data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json index 518458e37..2bf240f96 100644 --- a/data/helm_lite/tiiuae/falcon-40b/346c2a85-3daf-41e9-9305-78851dcf05ae.json +++ b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (40B)", + "name": "Falcon 40B", "id": "tiiuae/falcon-40b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.217, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - 
"generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 
+311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json similarity index 76% rename from data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json rename to data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json index 4a1515414..9a704269c 100644 --- a/data/helm_lite/tiiuae/falcon-7b/69e02d7b-d536-4ff4-a58e-b880ff87f357.json +++ b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Falcon (7B)", + "name": "Falcon 7B", "id": "tiiuae/falcon-7b", "developer": "tiiuae", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.064, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json similarity index 76% rename from data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json rename to data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json index fd33bd463..1f111d01c 100644 --- a/data/helm_lite/upstage/solar-pro-241126/3286a69f-cdba-49a5-939a-e14ad759e7a4.json +++ b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.602, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for 
reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + 
"evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -390,13 +447,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -440,19 +506,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -495,12 +570,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -544,13 +628,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json rename to data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json index 574c20cd8..8026be475 100644 --- a/data/helm_lite/writer/palmyra-x-004/b798adc1-01f0-46c5-95a4-8b67199d624b.json +++ b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.808, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + 
"dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -93,13 +105,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -173,14 +194,23 @@ } }, "generation_config": { - "mode": "closedbook", - "stop": "none" + "additional_details": { + "mode": "closedbook", + "stop": "none" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -224,14 +254,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -275,20 +314,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - 
"us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -332,25 +380,34 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True", - "stop": "none" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True", + "stop": "none" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -394,13 +451,22 @@ } }, "generation_config": { - "stop": "none" + "additional_details": { + "stop": "none" + } } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -444,20 +510,29 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" + "additional_details": { + "subset": [ + 
"abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ], + "stop": "none" + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -500,12 +575,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -549,14 +633,16 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ], - "stop": "none" + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ], + "stop": "none" + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json rename to data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json index 85f887f2f..5e5faf9fb 100644 --- a/data/helm_lite/writer/palmyra-x-v2/7a07a202-aa88-47fc-987d-6d44a57b6985.json +++ b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V2 (33B)", + "name": "Palmyra X V2 33B", "id": "writer/palmyra-x-v2", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. 
The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + "abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - 
"use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 
2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json similarity index 76% rename from data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json rename to data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json index ae69f6c5b..c8073d254 100644 --- a/data/helm_lite/writer/palmyra-x-v3/ac0a7249-11e7-493d-9190-8c1913bb1c42.json +++ b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1767657482.092302", - "retrieved_timestamp": "1767657482.092302", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479", + "retrieved_timestamp": "1770834614.1822479", "source_metadata": { "source_name": "helm_lite", "source_type": "documentation", @@ -12,7 +9,7 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" @@ -20,6 +17,13 @@ "evaluation_results": [ { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_lite", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -30,7 +34,6 @@ "score_details": { "score": 0.679, "details": { - "description": null, "tab": "Accuracy", "Mean win rate - Efficiency": { "description": null, @@ -44,12 +47,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NarrativeQA - F1", + "evaluation_name": "NarrativeQA", + "source_data": { + "dataset_name": "NarrativeQA", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -92,12 +104,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "NaturalQuestions (closed-book) - F1", + "evaluation_name": "NaturalQuestions (closed-book)", + "source_data": { + "dataset_name": "NaturalQuestions (closed-book)", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -171,13 +192,22 @@ } }, "generation_config": { - "mode": "closedbook" + "additional_details": { + "mode": "closedbook" + } } }, { - "evaluation_name": "OpenbookQA - EM", + "evaluation_name": "OpenbookQA", + "source_data": { + "dataset_name": "OpenbookQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -221,14 +251,23 @@ } }, "generation_config": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" + "additional_details": { + "dataset": "openbookqa", + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MMLU - EM", + "evaluation_name": "MMLU", + "source_data": { + "dataset_name": "MMLU", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -272,20 +311,29 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" + "additional_details": { + "subject": [ + 
"abstract_algebra", + "college_chemistry", + "computer_security", + "econometrics", + "us_foreign_policy" + ], + "method": "multiple_choice_joint" + } } }, { - "evaluation_name": "MATH - Equivalent (CoT)", + "evaluation_name": "MATH", + "source_data": { + "dataset_name": "MATH", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -329,24 +377,33 @@ } }, "generation_config": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" + "additional_details": { + "subject": [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus" + ], + "level": "1", + "use_official_examples": "False", + "use_chain_of_thought": "True" + } } }, { - "evaluation_name": "GSM8K - EM", + "evaluation_name": "GSM8K", + "source_data": { + "dataset_name": "GSM8K", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -389,12 +446,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "LegalBench - EM", + "evaluation_name": "LegalBench", + "source_data": { + "dataset_name": "LegalBench", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -438,19 +504,28 @@ } }, "generation_config": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] + "additional_details": { + "subset": [ + "abercrombie", + "corporate_lobbying", + "function_of_decision_section", + "international_citizenship_questions", + "proa" + ] + } } }, { - "evaluation_name": "MedQA - 
EM", + "evaluation_name": "MedQA", + "source_data": { + "dataset_name": "MedQA", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -493,12 +568,21 @@ } } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } }, { - "evaluation_name": "WMT 2014 - BLEU-4", + "evaluation_name": "WMT 2014", + "source_data": { + "dataset_name": "WMT 2014", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + ] + }, "metric_config": { - "evaluation_description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -542,13 +626,15 @@ } }, "generation_config": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] + "additional_details": { + "language_pair": [ + "cs-en", + "de-en", + "fr-en", + "hi-en", + "ru-en" + ] + } } } ] diff --git a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json rename to data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json index a5bdb42fc..a5d4de71f 100644 --- a/data/helm_mmlu/01-ai/yi-34b/73d9f70c-acbb-4dfa-ae8e-e5c4f6b74c9a.json +++ b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi (34B)", + "name": "Yi 34B", "id": "01-ai/yi-34b", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.315, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json rename to data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json index 5b2c50278..1f0a7e20f 100644 --- a/data/helm_mmlu/01-ai/yi-6b/97569bf5-1e12-4baa-80cc-019be1725ebb.json +++ b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": 
"Yi (6B)", + "name": "Yi 6B", "id": "01-ai/yi-6b", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - 
"mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + 
"evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 
+2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.651, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json similarity index 78% rename from data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json rename to data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json index 938fbc9f2..4838cda1c 100644 --- a/data/helm_mmlu/01-ai/yi-large-preview/7c4b387f-45be-41cb-8102-cd738e60f99d.json +++ b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_mmlu/01-ai_yi-large-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Yi Large (Preview)", + "name": "Yi Large Preview", "id": "01-ai/yi-large-preview", "developer": "01-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.258, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json 
b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json rename to data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json index b05362e32..45536e1a1 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-large/027b7bd4-8943-4d2c-9674-15d33792d391.json +++ b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", 
+ "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.147, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json rename to data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json index 374350118..727c60261 100644 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/e5ed6c70-6874-4671-abb0-25bbd82471b4.json +++ b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", 
- "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + 
"mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" 
+ "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us 
foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": 
"clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, 
"generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.206, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json similarity index 78% rename from data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json rename to data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json index 2f32db71e..3a25316d9 100644 --- a/data/helm_mmlu/ai21/jamba-instruct/4e236f80-5d03-4547-b199-b8718439fbed.json +++ b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/ai21_jamba-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", 
+ "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { 
+ "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json similarity index 78% rename from data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json rename to data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json index 0ee329ec3..8bf036c64 100644 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/1fffb281-ad0f-4e46-9e18-f7e6643f9f28.json +++ b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo 1.7 (7B)", + "name": "OLMo 1.7 7B", "id": "allenai/olmo-1.7-7b", "developer": "allenai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - 
"college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + 
"high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 
+645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 
+1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary 
Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.196, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json similarity index 78% rename from data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json rename to data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json index dc71abcb3..2b8d4cdfb 100644 --- a/data/helm_mmlu/allenai/olmo-7b/31666792-6d68-42da-95f8-3b9f8590c7fd.json +++ b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "OLMo (7B)", + "name": "OLMo 7B", "id": "allenai/olmo-7b", "developer": "allenai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding 
(MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + 
"business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.68, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json similarity index 78% rename from data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json rename to data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json index 036d68cdd..1bb99dccc 100644 --- a/data/helm_mmlu/amazon/nova-lite-v1:0/c77fc3bf-1481-46c2-8f29-9930e42c4567.json +++ b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 
+16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.987, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json similarity index 78% rename from data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json rename to data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json index dc2e53d31..ab9b8c843 100644 --- a/data/helm_mmlu/amazon/nova-micro-v1:0/1ca3812c-50a8-455c-b2dc-54cca6ec8123.json +++ b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1767657487.397731", - 
"retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - 
"mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + 
"mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": 
{ + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + 
"subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 1.0, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json similarity index 78% rename from data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json 
rename to data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json index 74dd04dc4..af30c4448 100644 --- a/data/helm_mmlu/amazon/nova-pro-v1:0/28265def-113d-4e90-9ba9-02dfe86f5ad2.json +++ b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - 
"mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", 
+ "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + 
"subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + 
"additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.975, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json rename to data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json index 94c86600d..c2616d7f8 100644 --- a/data/helm_mmlu/anthropic/claude-2.1/357edc36-d500-4e6e-94a4-6653b769b5d8.json +++ b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - 
"mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.048, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json rename to data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json index 15ba960b1..76628bf51 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/67f72a7f-15b7-4a2e-b478-38091cba2189.json +++ b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Haiku (20241022)", + "name": "Claude 3.5 Haiku 20241022", "id": "anthropic/claude-3-5-haiku-20241022", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + 
"professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - 
"subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.128, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json index 43e320af9..9d9557efc 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/3aeb81a2-9e35-4fbc-ab31-d94cffc5d17d.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20240620)", + "name": "Claude 3.5 Sonnet 20240620", "id": "anthropic/claude-3-5-sonnet-20240620", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + 
"college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": 
"abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.17, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json rename to data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json index 7df36bb32..35be68aa6 100644 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/f8f66e38-00b1-4150-84bf-466ffc8ce6a2.json +++ b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ 
"evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3.5 Sonnet (20241022)", + "name": "Claude 3.5 Sonnet 20241022", "id": "anthropic/claude-3-5-sonnet-20241022", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + 
"mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - 
EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": 
"Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + 
"additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } 
}, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.311, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json rename to data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json index 9885a79d4..969900aba 100644 --- 
a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/b0218eab-984f-4829-90d6-e7fc6f60c530.json +++ b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Haiku (20240307)", + "name": "Claude 3 Haiku 20240307", "id": "anthropic/claude-3-haiku-20240307", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + 
"mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.28, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json rename to data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json index ab57a1503..230be4291 100644 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/fb4270e9-d4a6-45ea-b47b-d0cf82ea1a2d.json +++ b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Opus (20240229)", + "name": "Claude 3 Opus 20240229", "id": "anthropic/claude-3-opus-20240229", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": 
"Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + 
"additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - 
"evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.014, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json rename to data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json index 710c70a2e..dd7543ecb 100644 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/08d951d1-2912-4a00-99ce-f90340a7fd2a.json +++ b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Claude 3 Sonnet (20240229)", + "name": "Claude 3 Sonnet 20240229", "id": "anthropic/claude-3-sonnet-20240229", "developer": "anthropic", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - 
"college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + 
"high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": 
"econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": 
"Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.082, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json similarity index 78% rename from data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json rename to data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json index b632a0864..c9e9779b1 100644 --- a/data/helm_mmlu/anthropic/claude-instant-1.2/bfff8f1b-24cc-41b8-b11c-85ee48bef059.json +++ b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains 
[(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + 
"college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.186, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json similarity index 78% rename from data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json rename to data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json index 6ef0cc597..6bebd236d 100644 --- a/data/helm_mmlu/cohere/command-r-plus/f1509273-dea1-477e-bf04-02767838c1f9.json +++ b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 
+16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.825, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json similarity index 78% rename from data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json rename to data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json index 6fa172bf8..e82639d82 100644 --- a/data/helm_mmlu/cohere/command-r/45524eef-0678-47db-8620-a5f55e166e63.json +++ b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", 
- "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - 
"mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + 
"mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": 
"logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + 
"additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.959, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json similarity index 78% rename from data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json rename to data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json 
index 8d0b57f82..d5f73b61f 100644 --- a/data/helm_mmlu/databricks/dbrx-instruct/cd2371e9-e552-4944-bc30-c2269c960e16.json +++ b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + 
"mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct 
reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.537, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json similarity index 78% rename from data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json rename to data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json index 7837e5696..7ec071041 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/7378a9f3-28ad-475c-bdb0-b282f8f52e4e.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "DeepSeek LLM Chat (67B)", + "name": "DeepSeek LLM Chat 67B", "id": "deepseek-ai/deepseek-llm-67b-chat", "developer": "deepseek-ai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - 
"security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + 
"mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" 
+ "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us 
foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": 
"clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, 
"generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.387, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json similarity index 78% rename from data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json rename to data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json index b9d5d50e7..200a6e19c 100644 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/87716ef9-56bb-4737-b578-9e53742c714a.json +++ b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", 
- "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + 
"professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - 
"subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json rename to data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json index 4fb164090..86096274a 100644 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/8a60d74d-0a32-4aab-9bb9-c12e01a08c2b.json +++ b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.0 Pro (001)", + "name": "Gemini 1.0 Pro 001", "id": "google/gemini-1.0-pro-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All 
Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + 
"elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + 
"additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": 
"Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.677, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json rename to data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json index a91e47447..7aac2d734 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/ff7e3c87-0c6a-4095-b83a-0fba5468d26d.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (001)", + "name": "Gemini 1.5 Flash 001", "id": "google/gemini-1.5-flash-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - 
EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - 
"mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + 
] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, 
"generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - 
"subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + 
"additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.47, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json rename to data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json index c8a9b1912..a87c94c3b 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/ec78481a-0b0d-4709-99ea-6423372d6038.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (002)", + "name": "Gemini 1.5 Flash 002", "id": "google/gemini-1.5-flash-002", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - 
"mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + 
"mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 
+769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } 
}, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World 
History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.817, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json 
b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json rename to data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json index ffdf7910d..b8d59d877 100644 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/2a8845b3-cdbc-409c-8346-f83fb607999a.json +++ b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Flash (0514 preview)", + "name": "Gemini 1.5 Flash 0514 preview", "id": "google/gemini-1.5-flash-preview-0514", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - 
"virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + 
"mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": 
"global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.713, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json rename to data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json index 0115a3fa0..0632aee68 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/486b6479-f327-43ab-af2c-8824abaf5fe6.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (001)", + "name": "Gemini 1.5 Pro 001", "id": "google/gemini-1.5-pro-001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - 
"high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + 
"philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": 
"professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.349, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json rename to data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json index 1c57dbb48..d6a3ba87a 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/4ea206d4-961a-4fc8-824e-b5b8c0f3a36e.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (002)", + "name": "Gemini 1.5 Pro 002", "id": "google/gemini-1.5-pro-002", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All 
Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + 
"elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + 
"additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": 
"Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.334, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json similarity index 78% rename from data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json rename to data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json index 065435cc3..de3a77c03 100644 --- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/bedeefc9-8e78-4ce9-9883-b222df8e3ef7.json +++ b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 1.5 Pro (0409 preview)", + "name": "Gemini 1.5 Pro 0409 preview", "id": "google/gemini-1.5-pro-preview-0409", "developer": "google", "inference_platform": 
"unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - 
"mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": 
{ - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School 
World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - 
"evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": 
"sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.118, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json similarity index 78% rename from data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json rename to data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json index 9b4101c21..6b53de064 100644 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/0837a2fd-1f25-4133-9ce6-b8ca29830f70.json +++ b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": 
"helm_mmlu/google_gemini-2.0-flash-exp/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemini 2.0 Flash (Experimental)", + "name": "Gemini 2.0 Flash Experimental", "id": "google/gemini-2.0-flash-exp", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - 
"mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + 
"mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.567, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json similarity index 78% rename from data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json rename to data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json index 2a0eccbe5..8720cc062 100644 --- a/data/helm_mmlu/google/gemma-2-27b/b732e4c3-526e-42b3-8003-defe6f99dec5.json +++ b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (27B)", + "name": "Gemma 2 27B", "id": "google/gemma-2-27b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + 
"mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - 
"evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual 
Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { 
+ "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.05, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json similarity index 78% rename from data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json rename to data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json index 7b83a32f9..2007b06df 100644 --- a/data/helm_mmlu/google/gemma-2-9b/72c70a52-df3d-48b4-bd2d-3161f1a4cf6b.json +++ b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma 2 (9B)", + "name": "Gemma 2 9B", "id": "google/gemma-2-9b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": 
"Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + 
"additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - 
"evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.265, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json similarity index 78% rename from data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json rename to data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json index 1480d9d56..963d13c9a 100644 --- a/data/helm_mmlu/google/gemma-7b/11b66d50-28d9-42bc-8f91-463b02fa96f7.json +++ b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Gemma (7B)", + "name": "Gemma 7B", "id": "google/gemma-7b", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - 
"high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", 
+ "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } 
} }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical 
Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances 
that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json similarity index 78% rename from data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json rename to data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json index a20b853b7..c0271bcb3 100644 --- a/data/helm_mmlu/google/text-bison@001/70210df9-1fb2-4fdd-b6eb-0d0aec88992e.json +++ b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Bison)", + "name": "PaLM-2 Bison", "id": "google/text-bison@001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + 
"global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": 
"abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + 
"evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human 
sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - 
"evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.192, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json similarity index 78% rename from data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json rename to data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json index 061cfda40..42c5040aa 100644 --- a/data/helm_mmlu/google/text-unicorn@001/c2e53d3a-b85c-4888-8b20-225db39301ab.json +++ b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "PaLM-2 (Unicorn)", + "name": "PaLM-2 Unicorn", "id": "google/text-unicorn@001", "developer": "google", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", 
+ "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - 
"mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { 
+ "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.142, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json rename to data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json index 999bc7bce..453cd8b3a 100644 --- a/data/helm_mmlu/meta/llama-2-13b/a477c332-b082-4ad5-8d2f-905690e9d211.json +++ b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": 
"0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (13B)", + "name": "Llama 2 13B", "id": "meta/llama-2-13b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - 
"mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + 
"mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": 
"jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": 
"conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.502, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json rename to 
data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json index 2bd647ad6..aa6a9caa2 100644 --- a/data/helm_mmlu/meta/llama-2-70b/ba574f5e-cc59-4994-a595-e6472c032fc4.json +++ b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (70B)", + "name": "Llama 2 70B", "id": "meta/llama-2-70b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + 
"mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, 
"generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 
+1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - 
"evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { 
+ "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ 
} }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How 
many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.508, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json similarity index 78% rename from data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json rename to data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json index f7641555c..0649e7329 100644 --- a/data/helm_mmlu/meta/llama-2-7b/9cfa7f91-bfd0-4f02-988c-1978df8db303.json +++ b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 2 (7B)", + "name": "Llama 2 7B", "id": "meta/llama-2-7b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - 
"prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + 
"mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 
@@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { 
- "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", 
+ "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + 
"evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.681, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json similarity index 78% rename from data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json rename to data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json index 028924f0a..4f09a5ee3 100644 --- a/data/helm_mmlu/meta/llama-3-70b/607a4b9b-3442-4690-b116-a927c6822fb3.json +++ b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (70B)", + "name": "Llama 3 70B", "id": "meta/llama-3-70b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.524, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json similarity index 78% rename from data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json rename to data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json index 493305a26..83f907e80 100644 --- a/data/helm_mmlu/meta/llama-3-8b/44decfe6-57ed-4677-a859-4fe5ae25b237.json +++ b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3 (8B)", + "name": "Llama 3 8B", "id": "meta/llama-3-8b", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } 
}, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + 
"high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": 
"Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ 
} }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.733, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json rename to data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json index 5e68e1b5a..c4ce37e9d 100644 --- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/af78c3b5-5d91-431d-85ac-783b5a324723.json +++ b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (405B)", + "name": "Llama 3.1 Instruct Turbo 405B", "id": "meta/llama-3.1-405b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - 
"mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { 
+ "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.33, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json rename to data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json index 7f880e52b..0e4b849f9 100644 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/1224cee0-22f8-41b0-a7da-8a6100001a3e.json +++ b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (70B)", + "name": "Llama 3.1 Instruct Turbo 70B", "id": "meta/llama-3.1-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - 
"mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + 
"mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 
+769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } 
}, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World 
History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.021, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json 
b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json rename to data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json index bdc0510b6..6c1d661d4 100644 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/2cb2551b-dbca-46d9-a19a-165d1ac60dee.json +++ b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.1 Instruct Turbo (8B)", + "name": "Llama 3.1 Instruct Turbo 8B", "id": "meta/llama-3.1-8b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + 
"mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in 
the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + 
"additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", 
+ "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.475, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json rename to data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json index e9ec2f904..599cd6855 100644 --- a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/3c53ce3d-4ee8-483c-be9f-964395103289.json +++ b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (11B)", + "name": "Llama 3.2 Vision Instruct Turbo 11B", "id": "meta/llama-3.2-11b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.897, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json rename to data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json index 51cb25f1e..f14700c78 100644 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/11e364be-39e9-4b42-97d7-ab771f17973c.json +++ b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo (90B)", + "name": "Llama 3.2 Vision Instruct Turbo 90B", "id": "meta/llama-3.2-90b-vision-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.773, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json similarity index 78% rename from data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json rename to data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json index 124028675..faf8ae128 100644 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/bbcf8f14-600c-4c93-b63d-64aabcab23a3.json +++ b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": 
"1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Llama 3.3 Instruct Turbo (70B)", + "name": "Llama 3.3 Instruct Turbo 70B", "id": "meta/llama-3.3-70b-instruct-turbo", "developer": "meta", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - 
"mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + 
"mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The 
computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": 
"logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + 
"additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.722, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json rename to data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json index f3162d0fe..95bd9f1b8 100644 --- 
a/data/helm_mmlu/microsoft/phi-2/91bf0cf5-2010-4226-8b3e-d6ca019ce5b3.json +++ b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.824, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git 
a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json rename to data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json index 97f9c3c96..f1d62a268 100644 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/e58fb5ca-803c-4ac8-b392-1b9c9c8bb065.json +++ b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (14B)", + "name": "Phi-3 14B", "id": "microsoft/phi-3-medium-4k-instruct", "developer": "microsoft", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - 
"sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", 
+ "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { 
+ "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.015, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json similarity index 78% rename from data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json rename to data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json index 9da3cad91..bbe3afca0 100644 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/16c66bdf-dda3-4b12-b38c-73abee6a702f.json +++ b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Phi-3 (7B)", + "name": "Phi-3 7B", "id": "microsoft/phi-3-small-8k-instruct", "developer": "microsoft", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", 
- "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.708, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json rename to data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json index 2592b75a7..e788149e1 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/d0783259-681a-438f-b7dc-1c625a0be8ba.json +++ b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Instruct v0.3 (7B)", + "name": "Mistral Instruct v0.3 7B", "id": "mistralai/mistral-7b-instruct-v0.3", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + 
"computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.509, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json rename to data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json index 77ee3f1a1..5ca508d3b 100644 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/a05ce725-cdf0-4fe3-88b9-8631229e4443.json +++ b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral v0.1 (7B)", + "name": 
"Mistral v0.1 7B", "id": "mistralai/mistral-7b-v0.1", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - 
"mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + 
"evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 
+2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.213, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json rename to data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json index c34e3e47f..6b7873124 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2402/0dee4200-c4f0-438e-8d0d-ca92515c6e33.json +++ b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large (2402)", + "name": "Mistral Large 2402", "id": "mistralai/mistral-large-2402", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + 
"mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct 
reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - 
"evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical 
fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - 
"evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.464, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff 
--git a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json rename to data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json index 4e005a631..58aa6a379 100644 --- a/data/helm_mmlu/mistralai/mistral-large-2407/2869d585-567d-4ddc-ac38-3e036061b13e.json +++ b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Large 2 (2407)", + "name": "Mistral Large 2 2407", "id": "mistralai/mistral-large-2407", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - 
"us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + 
"mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { 
+ "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": 
"international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": 
"moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.24, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json similarity index 78% rename from data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json rename to data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json index ddc506063..457d9ed2a 100644 --- a/data/helm_mmlu/mistralai/mistral-small-2402/d277cca3-64da-4e4b-9210-3f5b910c975c.json +++ b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral Small (2402)", + "name": "Mistral Small 2402", "id": "mistralai/mistral-small-2402", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - 
"high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + 
"moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, 
"generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } 
} }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral 
Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 
+2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.54, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json similarity index 78% rename from data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json rename to data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json index 35cc50f7b..c7ab33c35 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/cebd1e82-0053-4541-bdf4-5a4fa0736a8a.json +++ b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x22B)", + "name": "Mixtral 8x22B", "id": "mistralai/mixtral-8x22b", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + 
"formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": 
"abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + 
"evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human 
sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - 
"evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.598, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json similarity index 78% rename from data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json rename to data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json index 247f8572e..3ed7c6104 100644 --- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/0f6762ed-e462-4ce7-86ea-dfc3a634d97c.json +++ b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mixtral (8x7B 32K seqlen)", + "name": "Mixtral 8x7B 32K seqlen", "id": "mistralai/mixtral-8x7b-32kseqlen", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All 
Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - 
"mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", 
- "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - 
"evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + 
] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, 
"generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": 
"helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - 
"subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + 
"additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.689, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json similarity index 78% rename from data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json rename to data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json index 20e5d8bc5..e5aec6b67 100644 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/87bd4fa2-0c5c-4b6a-8386-e84f1cdd9066.json +++ b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1767657487.397731", - "retrieved_timestamp": 
"1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Mistral NeMo (2402)", + "name": "Mistral NeMo 2402", "id": "mistralai/open-mistral-nemo-2407", "developer": "mistralai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - 
"mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + 
"mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + 
"evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High 
School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": 
"Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.215, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json 
b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json similarity index 78% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json index 61bdc2a92..e429d6dbc 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/48a0dd6b-9304-460a-8e4e-420c60dfa854.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0125)", + "name": "GPT-3.5 Turbo 0125", "id": "openai/gpt-3.5-turbo-0125", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - 
"mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + 
"mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - 
"evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual 
Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { 
+ "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language 
Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", 
+ "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.493, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json similarity index 78% rename from data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json rename to data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json index a7037b692..92faf2169 100644 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/1e1140d0-4dc9-4bb7-9560-6c9be1cbda29.json +++ b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-3.5 Turbo (0613)", + "name": "GPT-3.5 Turbo 0613", "id": "openai/gpt-3.5-turbo-0613", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", 
- "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + 
"world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + 
"additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", 
+ "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + 
"additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.589, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json rename to data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json index 8a368f8b6..6ccc418f3 100644 --- a/data/helm_mmlu/openai/gpt-4-0613/8c587ab3-8a32-4cb1-aa67-63c2fb2b929f.json +++ b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 (0613)", + "name": "GPT-4 0613", "id": "openai/gpt-4-0613", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + 
"high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, 
"generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" 
+ ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": 
"Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ 
-2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.517, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json rename to data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json index 41438331c..610be9719 100644 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/174ad35c-d6b5-49bd-930c-9c83608213a9.json +++ b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (1106 preview)", + "name": "GPT-4 Turbo 1106 preview", "id": "openai/gpt-4-1106-preview", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - 
"mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches 
a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - 
"evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + 
"additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": 
"medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - 
EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.416, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json rename to data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json index a7796e764..a348a9fb9 100644 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/348bbc24-09de-4d1e-98bc-079e87fea558.json +++ b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": 
"helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4 Turbo (2024-04-09)", + "name": "GPT-4 Turbo 2024-04-09", "id": "openai/gpt-4-turbo-2024-04-09", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - 
"mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + 
"mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": 
{ + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + 
"subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - 
"subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.351, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json similarity index 78% rename from 
data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json rename to data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json index 1572c27c7..76ba53d53 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/f37fc452-58f2-4d80-a71c-9331f7fe549e.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-05-13)", + "name": "GPT-4o 2024-05-13", "id": "openai/gpt-4o-2024-05-13", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - 
"mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.671, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json rename to data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json index 4ba84b207..2d538eb02 100644 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/71df45d2-1a27-4ff2-853c-e853f809ff52.json +++ b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o (2024-08-06)", + "name": "GPT-4o 2024-08-06", "id": "openai/gpt-4o-2024-08-06", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - 
"human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + 
"world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + 
"additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", 
+ "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + 
"additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 
+2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions 
subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.52, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json similarity index 78% rename from data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json rename to data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json index f69b1b3d4..7753003a8 100644 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/7c049135-a8bc-46ca-9a85-cba23e8696fd.json +++ b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "GPT-4o mini (2024-07-18)", + "name": "GPT-4o mini 2024-07-18", "id": "openai/gpt-4o-mini-2024-07-18", "developer": "openai", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - 
"abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + 
"high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } 
} }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 
+1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary 
Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted 
output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM 
on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.774, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json rename to data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json index 190b1dce2..4b924f5af 100644 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/69737d19-682b-494f-b10b-fb788e83076b.json +++ b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 Chat (110B)", + "name": "Qwen1.5 Chat 110B", "id": "qwen/qwen1.5-110b-chat", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, 
"metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + 
"additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + 
"evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": 
"url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.875, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json rename to data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json index 7ff151a72..9bfc87f91 100644 --- a/data/helm_mmlu/qwen/qwen1.5-14b/c8de5fb0-5b1b-482f-b34a-d85e22e61bb9.json +++ b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": 
"third_party" }, "model_info": { - "name": "Qwen1.5 (14B)", + "name": "Qwen1.5 14B", "id": "qwen/qwen1.5-14b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - 
"mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + 
"mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": 
"Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances 
that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": 
"Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.796, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json rename to data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json index 421333da5..d1a9f19e1 100644 --- a/data/helm_mmlu/qwen/qwen1.5-32b/ed668c03-e5df-4871-b2fa-876b2cda62f3.json +++ b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - 
"evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (32B)", + "name": "Qwen1.5 32B", "id": "qwen/qwen1.5-32b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - 
"mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + 
"mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer 
Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - 
"evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding 
(MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { 
- "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.624, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json 
b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json rename to data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json index d14327eec..94c5e4e80 100644 --- a/data/helm_mmlu/qwen/qwen1.5-72b/c504b47e-e4eb-4d5e-a01a-7c2b4fd32757.json +++ b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (72B)", + "name": "Qwen1.5 72B", "id": "qwen/qwen1.5-72b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - 
"mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + 
"mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - 
EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, 
"score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.65, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json similarity index 78% rename from data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json rename to data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json index d9688a597..166da7894 100644 --- a/data/helm_mmlu/qwen/qwen1.5-7b/1c743b00-0ca6-4332-9bb6-7f62190d74e3.json +++ b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen1.5 (7B)", + "name": "Qwen1.5 7B", "id": "qwen/qwen1.5-7b", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - 
"machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + 
"mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us 
Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": 
"EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": 
"management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - 
EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.843, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json rename to data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json index abb62e63a..6f8b955e0 100644 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/7f9317d3-b2bc-481d-9b28-9f305612ac58.json +++ b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2 Instruct (72B)", + "name": "Qwen2 Instruct 72B", "id": "qwen/qwen2-72b-instruct", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - 
"elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + 
"human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + 
} } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive 
Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + 
}, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": 
"miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the 
predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + 
"source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.826, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json rename to data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json index ee06a7f3d..a61d620fd 100644 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/7b3bc40a-a606-419d-b784-99697c1df5bc.json +++ b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (72B)", + "name": "Qwen2.5 Instruct Turbo 72B", "id": "qwen/qwen2.5-72b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 
2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + 
"college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, 
@@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": 
false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": 
"mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - 
"evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ 
+ "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.548, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json similarity index 78% rename from data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json rename to data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json index f8033410f..c045e519d 100644 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/d7ac28f2-4c6d-44d9-9b87-b264df69a0cc.json +++ b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", 
"source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Qwen2.5 Instruct Turbo (7B)", + "name": "Qwen2.5 Instruct Turbo 7B", "id": "qwen/qwen2.5-7b-instruct-turbo", "developer": "qwen", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - 
"mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + 
"mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the 
Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": 
"The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": 
"multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.887, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json similarity index 78% rename from data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json rename to data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json index 
cde071792..0afa77758 100644 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/cc68185c-6ee2-40bd-8951-f104d898c7f8.json +++ b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - 
"mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + 
"mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: 
Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + 
"eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of 
instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + 
"subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] 
+ }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + 
"additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask 
Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + 
"groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.565, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + 
"additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json similarity index 78% rename from data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json rename to data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json index 7d7fe6a40..2c0cfc48a 100644 --- a/data/helm_mmlu/upstage/solar-pro-241126/78ddc5dc-3f25-4ff6-96a1-b9b677d22f51.json +++ b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - 
"mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + 
"mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a 
correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + 
"evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + 
"evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": 
"Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction 
of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + 
"method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact 
match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } 
}, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security 
Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.462, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json similarity index 78% rename from data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json rename to data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json index c2c0d493b..c204b253d 100644 --- a/data/helm_mmlu/writer/palmyra-x-004/ba74f375-fd6d-4bba-af63-605bd73c9b7f.json +++ b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -19,9 +16,16 @@ }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - 
"philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + 
"mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + 
"dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": "Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", 
"min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, 
"generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + 
"evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - 
"evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - "evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on 
Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + 
"url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.629, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json similarity index 78% rename from data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json rename to data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json index fd6405aa5..2eef769c8 100644 --- a/data/helm_mmlu/writer/palmyra-x-v3/41bf95f4-3c5b-4b33-ba3b-63ca32ae067f.json +++ b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1767657487.397731", - "retrieved_timestamp": "1767657487.397731", - "source_data": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ], + "schema_version": "0.2.0", + "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157", + "retrieved_timestamp": "1770835937.459157", "source_metadata": { "source_name": "helm_mmlu", "source_type": "documentation", @@ -12,16 +9,23 @@ "evaluator_relationship": "third_party" }, "model_info": { - "name": "Palmyra X V3 (72B)", + "name": "Palmyra X V3 72B", "id": "writer/palmyra-x-v3", "developer": "writer", "inference_platform": "unknown" }, "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects - EM", + "evaluation_name": "MMLU All Subjects", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -65,132 +69,141 @@ } }, "generation_config": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - 
"high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] + "additional_details": { + "subject": [ + "abstract_algebra", + "anatomy", + "astronomy", + "business_ethics", + "clinical_knowledge", + "college_biology", + "college_chemistry", + "college_computer_science", + "college_mathematics", + "college_medicine", + "college_physics", + "computer_security", + "conceptual_physics", + "econometrics", + "electrical_engineering", + "elementary_mathematics", + "formal_logic", + "global_facts", + "high_school_biology", + "high_school_chemistry", + "high_school_computer_science", + "high_school_european_history", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_mathematics", + "high_school_microeconomics", + "high_school_physics", + "high_school_psychology", + "high_school_statistics", + "high_school_us_history", + "high_school_world_history", + "human_aging", + "human_sexuality", + "international_law", + "jurisprudence", + "logical_fallacies", + "machine_learning", + "management", + "marketing", + 
"medical_genetics", + "miscellaneous", + "moral_disputes", + "moral_scenarios", + "nutrition", + "philosophy", + "prehistory", + "professional_accounting", + "professional_law", + "professional_medicine", + "professional_psychology", + "public_relations", + "security_studies", + "sociology", + "us_foreign_policy", + "virology", + "world_religions" + ], + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": [ + "mmlu_abstract_algebra", + "mmlu_anatomy", + "mmlu_astronomy", + "mmlu_business_ethics", + "mmlu_clinical_knowledge", + "mmlu_college_biology", + "mmlu_college_chemistry", + "mmlu_college_computer_science", + "mmlu_college_mathematics", + "mmlu_college_medicine", + "mmlu_college_physics", + "mmlu_computer_security", + "mmlu_conceptual_physics", + "mmlu_econometrics", + "mmlu_electrical_engineering", + "mmlu_elementary_mathematics", + "mmlu_formal_logic", + "mmlu_global_facts", + "mmlu_high_school_biology", + "mmlu_high_school_chemistry", + "mmlu_high_school_computer_science", + "mmlu_high_school_european_history", + "mmlu_high_school_geography", + "mmlu_high_school_government_and_politics", + "mmlu_high_school_macroeconomics", + "mmlu_high_school_mathematics", + "mmlu_high_school_microeconomics", + "mmlu_high_school_physics", + "mmlu_high_school_psychology", + "mmlu_high_school_statistics", + "mmlu_high_school_us_history", + "mmlu_high_school_world_history", + "mmlu_human_aging", + "mmlu_human_sexuality", + "mmlu_international_law", + "mmlu_jurisprudence", + "mmlu_logical_fallacies", + "mmlu_machine_learning", + "mmlu_management", + "mmlu_marketing", + "mmlu_medical_genetics", + "mmlu_miscellaneous", + "mmlu_moral_disputes", + "mmlu_moral_scenarios", + "mmlu_nutrition", + "mmlu_philosophy", + "mmlu_prehistory", + "mmlu_professional_accounting", + "mmlu_professional_law", + "mmlu_professional_medicine", + "mmlu_professional_psychology", + "mmlu_public_relations", + "mmlu_security_studies", + "mmlu_sociology", + "mmlu_us_foreign_policy", + "mmlu_virology", + "mmlu_world_religions" + ] + } } }, { - "evaluation_name": "Abstract Algebra - EM", + "evaluation_name": "Abstract Algebra", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -234,16 +247,25 @@ } }, "generation_config": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" + "additional_details": { + "subject": "abstract_algebra", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_abstract_algebra" + } } }, { - "evaluation_name": "Anatomy - EM", + "evaluation_name": "Anatomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output 
matches a correct reference exactly.", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -287,16 +309,25 @@ } }, "generation_config": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" + "additional_details": { + "subject": "anatomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_anatomy" + } } }, { - "evaluation_name": "College Physics - EM", + "evaluation_name": "College Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -490,16 +521,25 @@ } }, "generation_config": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" + "additional_details": { + "subject": "college_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_college_physics" + } } }, { - "evaluation_name": "Computer Security - EM", + "evaluation_name": "Computer Security", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -543,16 +583,25 @@ } }, "generation_config": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" + "additional_details": { + "subject": "computer_security", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_computer_security" + } } }, { - "evaluation_name": "Econometrics - EM", + "evaluation_name": "Econometrics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -596,16 +645,25 @@ } }, "generation_config": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" + "additional_details": { + "subject": "econometrics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_econometrics" + } } }, { - "evaluation_name": "Global Facts - EM", + "evaluation_name": 
"Global Facts", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -649,16 +707,25 @@ } }, "generation_config": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" + "additional_details": { + "subject": "global_facts", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_global_facts" + } } }, { - "evaluation_name": "Jurisprudence - EM", + "evaluation_name": "Jurisprudence", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -702,16 +769,25 @@ } }, "generation_config": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" + "additional_details": { + "subject": "jurisprudence", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_jurisprudence" + } } }, { - "evaluation_name": "Philosophy - EM", + "evaluation_name": "Philosophy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -755,16 +831,25 @@ } }, "generation_config": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" + "additional_details": { + "subject": "philosophy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_philosophy" + } } }, { - "evaluation_name": "Professional Psychology - EM", + "evaluation_name": "Professional Psychology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": 
"continuous", "min_score": 0.0, @@ -898,16 +983,25 @@ } }, "generation_config": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" + "additional_details": { + "subject": "professional_psychology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_professional_psychology" + } } }, { - "evaluation_name": "Us Foreign Policy - EM", + "evaluation_name": "Us Foreign Policy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The us foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -951,16 +1045,25 @@ } }, "generation_config": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" + "additional_details": { + "subject": "us_foreign_policy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_us_foreign_policy" + } } }, { - "evaluation_name": "Astronomy - EM", + "evaluation_name": "Astronomy", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1004,16 +1107,25 @@ } }, "generation_config": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" + "additional_details": { + "subject": "astronomy", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_astronomy" + } } }, { - "evaluation_name": "Business Ethics - EM", + "evaluation_name": "Business Ethics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1057,16 +1169,25 @@ } }, "generation_config": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" + "additional_details": { + "subject": "business_ethics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_business_ethics" + } } }, { - "evaluation_name": "Clinical Knowledge - EM", + "evaluation_name": "Clinical Knowledge", + "source_data": { + "dataset_name": "helm_mmlu", + 
"source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1110,16 +1231,25 @@ } }, "generation_config": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" + "additional_details": { + "subject": "clinical_knowledge", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_clinical_knowledge" + } } }, { - "evaluation_name": "Conceptual Physics - EM", + "evaluation_name": "Conceptual Physics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1163,16 +1293,25 @@ } }, "generation_config": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" + "additional_details": { + "subject": "conceptual_physics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_conceptual_physics" + } } }, { - "evaluation_name": "Electrical Engineering - EM", + "evaluation_name": "Electrical Engineering", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1216,16 +1355,25 @@ } }, "generation_config": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" + "additional_details": { + "subject": "electrical_engineering", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_electrical_engineering" + } } }, { - "evaluation_name": "Elementary Mathematics - EM", + "evaluation_name": "Elementary Mathematics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference 
exactly.", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1269,16 +1417,25 @@ } }, "generation_config": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" + "additional_details": { + "subject": "elementary_mathematics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_elementary_mathematics" + } } }, { - "evaluation_name": "Formal Logic - EM", + "evaluation_name": "Formal Logic", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1322,16 +1479,25 @@ } }, "generation_config": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" + "additional_details": { + "subject": "formal_logic", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_formal_logic" + } } }, { - "evaluation_name": "High School World History - EM", + "evaluation_name": "High School World History", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1765,16 +1931,25 @@ } }, "generation_config": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" + "additional_details": { + "subject": "high_school_world_history", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_high_school_world_history" + } } }, { - "evaluation_name": "Human Sexuality - EM", + "evaluation_name": "Human Sexuality", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1848,16 +2023,25 @@ } }, "generation_config": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" + "additional_details": { + "subject": "human_sexuality", + "method": "multiple_choice_joint", + "eval_split": 
"test", + "groups": "mmlu_human_sexuality" + } } }, { - "evaluation_name": "International Law - EM", + "evaluation_name": "International Law", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1901,16 +2085,25 @@ } }, "generation_config": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" + "additional_details": { + "subject": "international_law", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_international_law" + } } }, { - "evaluation_name": "Logical Fallacies - EM", + "evaluation_name": "Logical Fallacies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -1954,16 +2147,25 @@ } }, "generation_config": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" + "additional_details": { + "subject": "logical_fallacies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_logical_fallacies" + } } }, { - "evaluation_name": "Machine Learning - EM", + "evaluation_name": "Machine Learning", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2007,16 +2209,25 @@ } }, "generation_config": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" + "additional_details": { + "subject": "machine_learning", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_machine_learning" + } } }, { - "evaluation_name": "Management - EM", + "evaluation_name": "Management", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The management subject in the Massive Multitask Language Understanding (MMLU) 
benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2060,16 +2271,25 @@ } }, "generation_config": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" + "additional_details": { + "subject": "management", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_management" + } } }, { - "evaluation_name": "Marketing - EM", + "evaluation_name": "Marketing", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2113,16 +2333,25 @@ } }, "generation_config": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" + "additional_details": { + "subject": "marketing", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_marketing" + } } }, { - "evaluation_name": "Medical Genetics - EM", + "evaluation_name": "Medical Genetics", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2166,16 +2395,25 @@ } }, "generation_config": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" + "additional_details": { + "subject": "medical_genetics", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_medical_genetics" + } } }, { - "evaluation_name": "Miscellaneous - EM", + "evaluation_name": "Miscellaneous", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2219,16 +2457,25 @@ } }, "generation_config": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" + "additional_details": { + "subject": "miscellaneous", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_miscellaneous" + } } }, { - 
"evaluation_name": "Moral Scenarios - EM", + "evaluation_name": "Moral Scenarios", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2302,16 +2549,25 @@ } }, "generation_config": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" + "additional_details": { + "subject": "moral_scenarios", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_moral_scenarios" + } } }, { - "evaluation_name": "Nutrition - EM", + "evaluation_name": "Nutrition", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2355,16 +2611,25 @@ } }, "generation_config": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" + "additional_details": { + "subject": "nutrition", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_nutrition" + } } }, { - "evaluation_name": "Prehistory - EM", + "evaluation_name": "Prehistory", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2408,16 +2673,25 @@ } }, "generation_config": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" + "additional_details": { + "subject": "prehistory", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_prehistory" + } } }, { - "evaluation_name": "Public Relations - EM", + "evaluation_name": "Public Relations", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Public Relations", 
"lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2461,16 +2735,25 @@ } }, "generation_config": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" + "additional_details": { + "subject": "public_relations", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_public_relations" + } } }, { - "evaluation_name": "Security Studies - EM", + "evaluation_name": "Security Studies", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2514,16 +2797,25 @@ } }, "generation_config": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" + "additional_details": { + "subject": "security_studies", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_security_studies" + } } }, { - "evaluation_name": "Sociology - EM", + "evaluation_name": "Sociology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2567,16 +2859,25 @@ } }, "generation_config": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" + "additional_details": { + "subject": "sociology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_sociology" + } } }, { - "evaluation_name": "Virology - EM", + "evaluation_name": "Virology", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2620,16 +2921,25 @@ } }, "generation_config": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" + "additional_details": { + "subject": "virology", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_virology" + } } }, { - "evaluation_name": "World Religions - EM", + "evaluation_name": "World Religions", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { - "evaluation_description": "The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, @@ -2673,14 +2983,23 @@ } }, "generation_config": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" + "additional_details": { + "subject": "world_religions", + "method": "multiple_choice_joint", + "eval_split": "test", + "groups": "mmlu_world_religions" + } } }, { "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_mmlu", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + ] + }, "metric_config": { "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, @@ -2691,11 +3010,12 @@ "score_details": { "score": 0.325, "details": { - "description": null, "tab": "Efficiency" } }, - "generation_config": {} + "generation_config": { + "additional_details": {} + } } ] } \ No newline at end of file diff --git a/scripts/HELM/parse_helm_leaderboards.sh b/scripts/HELM/parse_helm_leaderboards.sh new file mode 100755 index 000000000..a89a1a64e --- /dev/null +++ b/scripts/HELM/parse_helm_leaderboards.sh @@ -0,0 +1,9 @@ +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Capabilities --source_data_url https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Lite --source_data_url https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Classic --source_data_url https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_Instruct --source_data_url https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json + +uv run python3 -m scripts.HELM.adapter --leaderboard_name HELM_MMLU --source_data_url https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json \ No newline at end of file diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py index 3297cfac9..a3a7aca96 100644 --- a/utils/helm/adapter.py +++ b/utils/helm/adapter.py @@ -22,10 +22,12 @@ EvaluationLog, EvaluationResult, EvaluatorRelationship, + GenerationConfig, MetricConfig, ModelInfo, ScoreDetails, ScoreType, + SourceDataUrl ) import sys @@ -114,7 +116,7 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T else: spec = run_spec_names[0] args = spec.split(":", 1)[1].split(",") - + model_details = next( (arg.split("=", 1)[1] for arg in args if arg.startswith("model=")), "", @@ -126,12 +128,14 @@ def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> T if developer == "unknown": developer = get_developer(model_name) - return make_model_info( + model_info = 
make_model_info( model_name=model_name, developer=developer, inference_platform="unknown", - ), model_id + ) + model_info.id = model_id + return model_info def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): """Determine min/max values for each metric column.""" @@ -152,7 +156,6 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): def convert( leaderboard_name: str, leaderboard_data: List[Dict[str, Any]], - source_data: List[str], ): """Convert HELM leaderboard data into unified evaluation logs.""" retrieved_timestamp = str(time.time()) @@ -172,9 +175,9 @@ def convert( model_name = row[0].get("value") if model_name not in model_infos: - model_info, model_id = extract_model_info_from_row(row, model_name) + model_info = extract_model_info_from_row(row, model_name) model_infos[model_name] = model_info - model_ids[model_name] = model_id + model_ids[model_name] = model_info.id for col_idx, (header, cell) in enumerate(zip(headers[1:], row[1:])): full_eval_name = header.get("value") @@ -190,9 +193,22 @@ def convert( or "instruct" in leaderboard_name.lower() ) + if full_eval_name.lower().startswith('mean'): + metric_name = None + dataset_name = leaderboard_name + evaluation_name = full_eval_name + else: + dataset_name, metric_name = full_eval_name.split(' - ', 1) + evaluation_name = dataset_name + + if metric_name: + evaluation_description = f'{metric_name} on {dataset_name}' + else: + evaluation_description = header.get("description") + if is_new_metric: metric_config = MetricConfig( - evaluation_description=header.get("description"), + evaluation_description=evaluation_description, lower_is_better=header.get("lower_is_better", False), min_score=( 0.0 if mins[col_idx] >= 0 else math.floor(mins[col_idx]) @@ -203,6 +219,14 @@ def convert( score_type=ScoreType.continuous, ) + source_dataset_name = leaderboard_name if leaderboard_name.lower() == 'helm_mmlu' else dataset_name + + source_data = SourceDataUrl( + dataset_name=source_dataset_name, + source_type='url', + url=[args.source_data_url] + ) + generation_config = ( extract_generation_config(cell.get("run_spec_names", [])) if cell.get("run_spec_names") @@ -210,7 +234,8 @@ def convert( ) model_results[model_name][short_name] = EvaluationResult( - evaluation_name=full_eval_name, + evaluation_name=evaluation_name, + source_data=source_data, metric_config=metric_config, score_details=ScoreDetails( score=round(cell.get("value"), 3) @@ -221,7 +246,9 @@ def convert( "tab": tab_name, }, ), - generation_config=generation_config, + generation_config=GenerationConfig( + additional_details=generation_config + ) ) else: # Add extra score details under the same metric @@ -232,12 +259,16 @@ def convert( else f"{full_eval_name} - {tab_name}" ) - existing.score_details.details[detail_key] = { - "description": cell.get("description"), - "tab": tab_name, - "score": cell.get("value"), - } - + setattr( + existing.score_details.details, + detail_key, + { + "description": cell.get("description"), + "tab": tab_name, + "score": cell.get("value"), + } + ) + # Save evaluation logs for model_name, results_by_metric in model_results.items(): model_info = model_infos[model_name] @@ -250,7 +281,7 @@ def convert( ) eval_log = EvaluationLog( - schema_version="0.1.0", + schema_version="0.2.0", evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=make_source_metadata( @@ -259,7 +290,6 @@ def convert( evaluator_relationship=EvaluatorRelationship.third_party, ), model_info=model_info, - source_data=source_data, 
evaluation_results=list(results_by_metric.values()), ) @@ -287,15 +317,13 @@ def convert( args = parse_args() leaderboard_name = args.leaderboard_name.lower() - source_data = [args.source_data_url] print(f"Fetching {leaderboard_name} data from {args.source_data_url}") - leaderboard_data = fetch_json(source_data[0]) + leaderboard_data = fetch_json(args.source_data_url) convert( leaderboard_name=leaderboard_name, - leaderboard_data=leaderboard_data, - source_data=source_data, + leaderboard_data=leaderboard_data ) print("Done!")
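For reviewers, the behavioural core of the adapter change above is the rule that splits a HELM column header such as "World Religions - EM" into a dataset-level evaluation name plus a short description, while aggregate columns such as "Mean win rate" pass through unchanged. A minimal standalone sketch of that rule follows; split_header() and the sample calls are illustrative only and do not exist in utils/helm/adapter.py.

# Illustrative sketch mirroring the header-splitting logic added in convert().
def split_header(full_eval_name: str, leaderboard_name: str):
    if full_eval_name.lower().startswith("mean"):
        # Aggregate columns keep their name; the description later falls back
        # to whatever the leaderboard header provides.
        return full_eval_name, leaderboard_name, None
    # Subject/metric columns are split once on " - ", e.g. "World Religions - EM".
    dataset_name, metric_name = full_eval_name.split(" - ", 1)
    return dataset_name, dataset_name, f"{metric_name} on {dataset_name}"

print(split_header("World Religions - EM", "helm_mmlu"))
# ('World Religions', 'World Religions', 'EM on World Religions')
print(split_header("Mean win rate", "helm_mmlu"))
# ('Mean win rate', 'helm_mmlu', None)

This is why the JSON hunks above rename entries like "Astronomy - EM" to "Astronomy" and replace the long MMLU subject descriptions with short forms such as "EM on Astronomy".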